pax_global_header00006660000000000000000000000064142445306260014520gustar00rootroot0000000000000052 comment=6c601163998616bb88991931e443c645858e162c muscle-5.1.0/000077500000000000000000000000001424453062600130135ustar00rootroot00000000000000muscle-5.1.0/.gitattributes000066400000000000000000000047261424453062600157170ustar00rootroot00000000000000############################################################################### # Set default behavior to automatically normalize line endings. ############################################################################### * text=auto ############################################################################### # Set default behavior for command prompt diff. # # This is needed for earlier builds of msysgit that do not have it on by # default for csharp files. # Note: This is only used by command line ############################################################################### #*.cs diff=csharp ############################################################################### # Set the merge driver for project and solution files # # Merging from the command prompt will add diff markers to the files if there # are conflicts (Merging from VS is not affected by the settings below, in VS # the diff markers are never inserted). Diff markers may cause the following # file extensions to fail to load in VS. An alternative would be to treat # these files as binary and thus will always conflict and require user # intervention with every merge. 
To do so, just uncomment the entries below ############################################################################### #*.sln merge=binary #*.csproj merge=binary #*.vbproj merge=binary #*.vcxproj merge=binary #*.vcproj merge=binary #*.dbproj merge=binary #*.fsproj merge=binary #*.lsproj merge=binary #*.wixproj merge=binary #*.modelproj merge=binary #*.sqlproj merge=binary #*.wwaproj merge=binary ############################################################################### # behavior for image files # # image files are treated as binary by default. ############################################################################### #*.jpg binary #*.png binary #*.gif binary ############################################################################### # diff behavior for common document formats # # Convert binary document formats to text before diffing them. This feature # is only available from the command line. Turn it on by uncommenting the # entries below. ############################################################################### #*.doc diff=astextplain #*.DOC diff=astextplain #*.docx diff=astextplain #*.DOCX diff=astextplain #*.dot diff=astextplain #*.DOT diff=astextplain #*.pdf diff=astextplain #*.PDF diff=astextplain #*.rtf diff=astextplain #*.RTF diff=astextplain muscle-5.1.0/.github/000077500000000000000000000000001424453062600143535ustar00rootroot00000000000000muscle-5.1.0/.github/workflows/000077500000000000000000000000001424453062600164105ustar00rootroot00000000000000muscle-5.1.0/.github/workflows/build_linux.yml000066400000000000000000000012731424453062600214540ustar00rootroot00000000000000name: build_linux on: # Enable manual run from the Actions tab workflow_dispatch: jobs: build: runs-on: ubuntu-20.04 steps: # Checks-out the repository under $GITHUB_WORKSPACE - uses: actions/checkout@v2 # Runs a set of commands using the runners shell - name: Build-commands run: | echo Starting Build-commands echo GITHUB_WORKSPACE=$GITHUB_WORKSPACE uname -a cd 
$GITHUB_WORKSPACE ls -lh cd src make LDFLAGS2=-static - name: Upload binary artifact uses: actions/upload-artifact@v2 with: name: muscle-linux-binary path: src/Linux/muscle muscle-5.1.0/.github/workflows/build_osx.yml000066400000000000000000000013351424453062600211250ustar00rootroot00000000000000name: build_osx on: # Enable manual run from the Actions tab workflow_dispatch: jobs: # This workflow contains a single job called "build" build: runs-on: macos-10.15 steps: # Checks-out the repository under $GITHUB_WORKSPACE - uses: actions/checkout@v2 # Runs a set of commands using the runners shell - name: Build-commands run: | echo Starting Build-commands echo GITHUB_WORKSPACE=$GITHUB_WORKSPACE uname -a cd $GITHUB_WORKSPACE ls -lh cd src make - name: Upload binary artifact uses: actions/upload-artifact@v2 with: name: muscle-osx-binary path: src/Darwin/muscle muscle-5.1.0/.github/workflows/build_windows.yml000066400000000000000000000011511424453062600220020ustar00rootroot00000000000000name: build_windows on: workflow_dispatch jobs: build: runs-on: windows-2022 steps: - uses: actions/checkout@v2 - uses: actions/setup-dotnet@v1 - name: Build run: | cd src $msbuild = "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Msbuild\Current\Bin\MSBuild.exe" $a = @("muscle.sln", "/p:Platform=x64", "/p:Configuration=Release") & $msbuild $a - name: Upload binary artifact uses: actions/upload-artifact@v2 with: name: muscle-windows-exe path: src/x64/Release/muscle.exe muscle-5.1.0/.gitignore000066400000000000000000000071101424453062600150020ustar00rootroot00000000000000## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. 
## ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore # User-specific files *.rsuser *.suo *.user *.userosscache *.sln.docstates # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs # Mono auto generated files mono_crash.* # Build results [Dd]ebug/ [Dd]ebugPublic/ [Rr]elease/ [Rr]eleases/ x64/ x86/ [Ww][Ii][Nn]32/ [Aa][Rr][Mm]/ [Aa][Rr][Mm]64/ bld/ [Bb]in/ [Oo]bj/ [Ll]og/ [Ll]ogs/ # Visual Studio 2015/2017 cache/options directory .vs/ __pycache__/ *.o *.pyc # Cake - Uncomment if you are using it # tools/** # !tools/packages.config # Tabs Studio *.tss # Telerik's JustMock configuration file *.jmconfig # BizTalk build output *.btp.cs *.btm.cs *.odx.cs *.xsd.cs # OpenCover UI analysis results OpenCover/ # Azure Stream Analytics local run output ASALocalRun/ # MSBuild Binary and Structured Log *.binlog # NVidia Nsight GPU debugger configuration file *.nvuser # MFractors (Xamarin productivity tool) working folder .mfractor/ # Local History for Visual Studio .localhistory/ # Visual Studio History (VSHistory) files .vshistory/ # BeatPulse healthcheck temp database healthchecksdb # Backup folder for Package Reference Convert tool in Visual Studio 2017 MigrationBackufiles _Chutzpah* # Visual C++ cache files ipch/ *.aps *.ncb *.opendb *.opensdf *.sdf *.cachefile *.VC.db *.VC.VC.opendb # Visual Studio profiler *.psess *.vsp *.vspx *.sap # Visual Studio Trace Files *.e2e # TFS 2012 Local Workspace $tf/ # Guidance Automation Toolkit *.gpState # ReSharper is a .NET coding add-in _ReSharper*/ *.[Rr]e[Ss]harper *.DotSettings.user # TeamCity is a build add-in _TeamCity* # DotCover # files ending in .cache can be ignored *.[Cc]ache # but keep track of directories ending in .cache !?*.[Cc]ache/ # Others ClientBin/ ~$* *~ *.dbmdl *.dbproj.schemaview *.jfm *.pfx *.publishsettings orleans.codegen.cs # Including strong name files can present a security risk # (https://github.com/github/gitignore/pull/2483#issue-259490424) #*.snk # Since 
there are multiple workflows, uncomment next line to ignore bower_components # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) #bower_components/ # RIA/Silverlight projects Generated_Code/ # Backup & report files from converting an old project file # to a newer Visual Studio version. Backup files are not needed, # because we have git ;-) _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML UpgradeLog*.htm ServiceFabricBackup/ *.rptproj.bak # SQL Server files *.mdf *.ldf *.ndf # Business Intelligence projects *.rdl.data *.bim.layout *.bim_*.sep/ # Ionide (cross platform F# VS Code tools) working folder .ionide/ # Fody - auto-generated XML schema FodyWeavers.xsd # VS Code files for those working on multiple tools .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json *.code-workspace # Local History for Visual Studio Code .history/ # Windows Installer files from build outputs *.cab *.msi *.msix *.msm *.msp # JetBrains Rider *.sln.iml asePS/ dlldata.c # Benchmark Results BenchmarkDotNet.Artifacts/ # .NET Core project.lock.json project.fragment.lock.json artifacts/ # ASP.NET Scaffolding ScaffoldingReadMe.txt # StyleCop StyleCopReport.xml # Files built by Visual Studio *_i.c *_p.c *_h.h *.ilk *.meta *.obj *.iobj *.pch *.pdb *.ipdb *.pgc *.pgd *.rsp *.sbr *.tlb *.tli *.tlh *.tmp *.tmp_proj *_wpftmp.csproj *.log *.tlog *.vspscc *.vssscc .builds *.pidb *.svclog *.scc # Chutzpah Test CYGWIN*/ [lL]inux/ Darwin/ gitver.txt muscle-5.1.0/CONTRIBUTING.md000066400000000000000000000001121424453062600152360ustar00rootroot00000000000000Please see https://github.com/rcedgar/muscle/wiki/Contributing-to-MUSCLE. muscle-5.1.0/LICENSE000066400000000000000000001045151424453062600140260ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. 
For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <https://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <https://www.gnu.org/licenses/why-not-lgpl.html>. muscle-5.1.0/README.md000066400000000000000000000023531424453062600142750ustar00rootroot00000000000000![Muscle5](http://drive5.com/images/muscle5_header.jpg) MUSCLE is widely-used software for making multiple alignments of biological sequences. Version 5 of MUSCLE achieves highest scores on Balibase, Bralibase and Balifam benchmark tests and scales to thousands of sequences on a commodity desktop computer. This version supports generating an ensemble of alternative alignments with the same high accuracy obtained with default parameters. By comparing downstream predictions from different alignments, such as trees, a biologist can evaluate the robustness of conclusions against alignment errors. ### Downloads Binary files are self-contained, no dependencies. https://github.com/rcedgar/muscle/releases ### Documentation [Muscle v5 home page](https://drive5.com/muscle5) [Manual](https://drive5.com/muscle5/manual) ### Building MUSCLE from source [https://github.com/rcedgar/muscle/wiki/Building-MUSCLE](https://github.com/rcedgar/muscle/wiki/Building-MUSCLE) ### Reference R.C. 
Edgar (2021) "MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping" [https://www.biorxiv.org/content/10.1101/2021.06.20.449169v1.full.pdf](https://www.biorxiv.org/content/10.1101/2021.06.20.449169v1.full.pdf) muscle-5.1.0/src/000077500000000000000000000000001424453062600136025ustar00rootroot00000000000000muscle-5.1.0/src/Makefile000066400000000000000000000026151424453062600152460ustar00rootroot00000000000000# The $(OS) variable is the o/s name returned by uname, which is # used as the sub-directory name under src/ where object files # and the executable are stored. This allows several target # operating systems in the same directory structure. # Typical values are: # Platform Value of $(OS) # -------- -------------- # Linux linux # Mac OSX Darwin # Cygwin CYGWIN_NT-10.0 # Building on Mac OSX is challenging because Apple does not support gcc or # the OMP threading library. Hacks to install gcc and OMP vary by OSX release. # This Makefile works with the AWS Catalina v10.15.7 AMI. With this AMI, # running 'brew install gcc' currently installs gcc v11. 
OS := $(shell uname) CPPFLAGS := $(CPPFLAGS) -DNDEBUG -pthread CXX := g++ ifeq ($(OS),Darwin) CXX := g++-11 endif CXXFLAGS := $(CXXFLAGS) -O3 -fopenmp -ffast-math LDFLAGS := $(LDFLAGS) -O3 -fopenmp -pthread -lpthread ${LDFLAGS2} HDRS := $(shell echo *.h) OBJS := $(shell echo *.cpp | sed "-es/^/$(OS)\//" | sed "-es/ / $(OS)\//g" | sed "-es/\.cpp/.o/g") SRCS := $(shell ls *.cpp *.h) .PHONY: clean $(OS)/muscle : gitver.txt $(OS)/ $(OBJS) $(CXX) $(LDFLAGS) $(OBJS) -o $@ # Warning: do not add -d option to strip, this is not portable strip $(OS)/muscle gitver.txt : $(SRCS) bash ./gitver.bash $(OS)/ : mkdir -p $(OS)/ $(OS)/%.o : %.cpp $(HDRS) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< clean: rm -rf gitver.txt $(OS)/ muscle-5.1.0/src/addconfseq.cpp000066400000000000000000000042151424453062600164170ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" #include "qscorer.h" /* Encodes a confidence value in [0,1] as one character: the digit '0'..'9' given by uint(Conf*10), or '+' when that product reaches 10. Values outside [0,1] trip asserta. */ static char ConfToChar1(double Conf) { asserta(Conf >= 0 && Conf <= 1); uint Tenth = uint(Conf*10); asserta(Tenth >= 0 && Tenth <= 10); if (Tenth == 10) return '+'; return '0' + Tenth; } /* Encodes a confidence value in [0,1] as the units digit of uint(Conf*100), i.e. the hundredths digit, or '+' when that product reaches 100. Values outside [0,1] trip asserta. */ static char ConfToChar2(double Conf) { asserta(Conf >= 0 && Conf <= 1); uint H = uint(Conf*100); asserta(H >= 0 && H <= 100); if (H == 100) return '+'; return '0' + H%10; } /* Writes one record to fOut: a ">ConfLabel" line followed by a line with one character per column of MSA number MSAIndex, each encoding E.GetConf_MSACol(MSAIndex, ColIndex). Dec selects the encoder: 1 = ConfToChar1, 2 = ConfToChar2; any other value trips asserta. */ static void Do1(FILE *fOut, const Ensemble &E, uint MSAIndex, const string &ConfLabel, int Dec) { const MSA &M = E.GetMSA(MSAIndex); const uint ColCount = M.GetColCount(); string ConfSeq; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { double Conf = E.GetConf_MSACol(MSAIndex, ColIndex); char c = '?'; switch (Dec) { case 1: c = ConfToChar1(Conf); break; case 2: c = ConfToChar2(Conf); break; default: asserta(false); } ConfSeq += c; } Pf(fOut, ">%s\n", ConfLabel.c_str()); Pf(fOut, "%s\n", ConfSeq.c_str()); } /* Entry point for the addconfseq command; input, ref and output names are read through opt(...). */ void cmd_addconfseq() { const string &InputFileName = opt(addconfseq); const string RefFileName = opt(ref); const string &OutputFileName = opt(output); /* Label of the added confidence records: "_conf_" unless optset_label is true, in which case opt(label) is used. */ string ConfLabel = "_conf_"; if (optset_label) ConfLabel = opt(label); 
/* Optional reference alignment: loaded only when optset_ref is true and then passed to E.SortMSA below. */ MSA Ref; if (optset_ref) Ref.FromFASTAFile_PreserveCase(RefFileName); Ensemble E; E.FromFile(InputFileName); if (optset_ref) E.SortMSA(Ref); FILE *fOut = CreateStdioFile(OutputFileName); const uint MSACount = E.GetMSACount(); const uint SeqCount = E.GetSeqCount(); /* Per MSA: a line with its name, two confidence records (Do1 with Dec 1, then Dec 2 under the label ConfLabel + "2"), then every sequence as a ">label" line plus exactly ColCount characters of sequence. */ for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const string &MSAName = E.GetMSAName(MSAIndex); Pf(fOut, "<%s\n", MSAName.c_str()); const MSA &M = E.GetMSA(MSAIndex); const uint ColCount = M.GetColCount(); const uint MSASeqCount = M.GetSeqCount(); asserta(MSASeqCount == SeqCount); Do1(fOut, E, MSAIndex, ConfLabel, 1); Do1(fOut, E, MSAIndex, ConfLabel + "2", 2); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const char *S = M.m_szSeqs[SeqIndex]; const char *Label = M.m_szNames[SeqIndex]; Pf(fOut, ">%s\n", Label); Pf(fOut, "%*.*s\n", ColCount, ColCount, S); } } CloseStdioFile(fOut); } muscle-5.1.0/src/align.cpp000066400000000000000000000073551424453062600154120ustar00rootroot00000000000000#include "muscle.h" /* Builds FileName from Pattern: the characters before the first '@', then "%s.%u" formatted from TREEPERMToStr(TP) and PerturbSeed, then the characters after the '@'. Dies when Pattern contains no '@'. NOTE(review): assumes Psa appends formatted text to FileName - confirm. */ void MakeReplicateFileName(const string &Pattern, TREEPERM TP, uint PerturbSeed, string &FileName) { FileName.clear(); size_t pos = Pattern.find('@'); if (pos == string::npos) Die("'@' not found in '%s'", Pattern.c_str()); for (size_t i = 0; i < pos; ++i) FileName += Pattern[i]; Psa(FileName, "%s.%u", TREEPERMToStr(TP), PerturbSeed); for (size_t i = pos+1; i < SIZE(Pattern); ++i) FileName += Pattern[i]; } /* Runs one alignment of InputSeqs with M and writes M.m_MSA to fOut via WriteMFA. Returns at once when fOut is null. A non-zero PerturbSeed calls ResetRand(PerturbSeed) and HP.PerturbProbs(PerturbSeed) on the default HMM parameters; TP is stored in M.m_TreePerm before M.Run. When WriteEfaHdr is true, a '<' line with the permutation name and seed is written before the alignment. */ static void Align(MPCFlat &M, MultiSequence &InputSeqs, uint PerturbSeed, TREEPERM TP, bool WriteEfaHdr, FILE *fOut) { if (fOut == 0) return; bool Nucleo = (g_Alpha == ALPHA_Nucleo); HMMParams HP; HP.FromDefaults(Nucleo); if (PerturbSeed > 0) { ResetRand(PerturbSeed); HP.PerturbProbs(PerturbSeed); } HP.ToPairHMM(); M.m_TreePerm = TP; M.Run(&InputSeqs); asserta(M.m_MSA != 0); if (WriteEfaHdr) { const char *TPStr = TREEPERMToStr(TP); fprintf(fOut, "<%s.%u\n", TPStr, PerturbSeed); } M.m_MSA->WriteMFA(fOut); } /* Entry point for the align command: loads the sequences named by opt(align), then produces one or more alignment replicates. */ void cmd_align() { MultiSequence InputSeqs; InputSeqs.LoadMFA(opt(align), 
true); const uint InputSeqCount = InputSeqs.GetSeqCount(); const string &OutputPattern = opt(output); if (OutputPattern.empty()) Die("Must set -output"); double MeanSeqLength = InputSeqs.GetMeanSeqLength(); uint MaxSeqLength = InputSeqs.GetMaxSeqLength(); ProgressLog("Input: %u seqs, avg length %.0f, max %u\n\n", InputSeqCount, MeanSeqLength, MaxSeqLength); if (InputSeqCount > 1000) Warning(">1k sequences, may be slow or use excessive memory, consider using -super5"); /* An '@' in the output pattern selects one output file per replicate; without it every replicate goes to the single file named by the pattern. */ bool OutputWildcard = OutputPattern.find('@') != string::npos; FILE *fOut = 0; /* The alphabet is chosen from the input via GuessIsNucleo. */ bool IsNucleo = InputSeqs.GuessIsNucleo(); if (IsNucleo) SetAlpha(ALPHA_Nucleo); else SetAlpha(ALPHA_Amino); MPCFlat M; if (optset_consiters) M.m_ConsistencyIterCount = opt(consiters); if (optset_refineiters) M.m_RefineIterCount = opt(refineiters); if (opt(stratified) && opt(diversified)) Die("Cannot set both -stratified and -diversified"); if (opt(stratified) || opt(diversified)) { if (optset_perm || optset_perturb) Die("Cannot set -perm or -perturb with -stratified or -diversified"); } /* Default replicate count: 4 when stratified, 100 when diversified, otherwise 1; opt(replicates) overrides it when set. */ uint RepCount = 1; if (opt(stratified)) RepCount = 4; else if (opt(diversified)) RepCount = 100; if (optset_replicates) RepCount = opt(replicates); /* Single-replicate path: one alignment, written without the '<' replicate header line, then return. */ if (RepCount == 1) { uint PerturbSeed = 0; if (optset_perturb) PerturbSeed = opt(perturb); TREEPERM TP = TP_None; if (optset_perm) TP = StrToTREEPERM(opt(perm)); if (TP == TP_All) Die("-perm all not supported, use -stratified"); string OutputFileName; if (OutputWildcard) MakeReplicateFileName(OutputPattern, TP, PerturbSeed, OutputFileName); else OutputFileName = OutputPattern; fOut = CreateStdioFile(OutputFileName); Align(M, InputSeqs, PerturbSeed, TP, false, fOut); CloseStdioFile(fOut); return; } /* Stratified runs multiply the replicate count by 4; the loop that follows derives the seed as RepIndex/4 in that case and cycles the tree permutation as RepIndex%4. */ bool Stratified = false; if (opt(stratified)) { Stratified = true; RepCount *= 4; if (optset_perm) Die("Cannot set both -perm and -stratified"); asserta(RepCount > 0); } string OutputFileName; if (!OutputWildcard) fOut = CreateStdioFile(OutputPattern); for (uint RepIndex = 0; RepIndex < RepCount; 
++RepIndex) { uint PerturbSeed = (Stratified ? RepIndex/4 : RepIndex); TREEPERM TP = (optset_perm ? StrToTREEPERM(opt(perm)) : TREEPERM(RepIndex%4)); ProgressLog("Replicate %u/%u, %s.%u\n", RepIndex+1, RepCount, TREEPERMToStr(TP), PerturbSeed); if (OutputWildcard) { MakeReplicateFileName(OutputPattern, TP, PerturbSeed, OutputFileName); fOut = CreateStdioFile(OutputFileName); } bool WriteEfaHeader = !OutputWildcard; Align(M, InputSeqs, PerturbSeed, TP, WriteEfaHeader, fOut); if (OutputWildcard) CloseStdioFile(fOut); } if (!OutputWildcard) CloseStdioFile(fOut); } muscle-5.1.0/src/alignpairflat.cpp000066400000000000000000000020441424453062600171230ustar00rootroot00000000000000#include "muscle.h" /* Aligns two non-empty sequences. Fills Fwd, Bwd and Post through CalcFwdFlat, CalcBwdFlat and CalcPostFlat, writes the alignment path into Path via CalcAlnFlat, copies Post into SparsePost when SparsePost is non-null, and returns the CalcAlnFlat score divided by min(L1, L2). All work buffers come from the Alloc* helpers, which allocate with myalloc64, so they are released with myfree (as MPCFlat::AlignAlns does) rather than with operator delete. */ float AlignPairFlat_SparsePost(const Sequence *Seq1, const Sequence *Seq2, string &Path, MySparseMx *SparsePost) { InitProbcons(); uint L1 = Seq1->GetLength(); uint L2 = Seq2->GetLength(); asserta(L1 > 0); asserta(L2 > 0); const byte *ByteSeq1 = Seq1->GetBytePtr(); const byte *ByteSeq2 = Seq2->GetBytePtr(); float *Fwd = AllocFB(L1, L2); float *Bwd = AllocFB(L1, L2); float *Post = AllocPost(L1, L2); CalcFwdFlat(ByteSeq1, L1, ByteSeq2, L2, Fwd); CalcBwdFlat(ByteSeq1, L1, ByteSeq2, L2, Bwd); CalcPostFlat(Fwd, Bwd, L1, L2, Post); /* Fwd and Bwd are not read after CalcPostFlat, so free them before the DP buffers are allocated. */ myfree(Fwd); myfree(Bwd); float *DPRows = AllocDPRows(L1, L2); char *TB = AllocTB(L1, L2); float Score = CalcAlnFlat(Post, L1, L2, DPRows, TB, Path); if (SparsePost != 0) SparsePost->FromPost(Post, L1, L2); myfree(Post); myfree(DPRows); myfree(TB); asserta(L1 > 0 && L2 > 0); float EA = Score/min(L1, L2); return EA; } /* Same as AlignPairFlat_SparsePost with no sparse posterior requested (SparsePost = 0). */ float AlignPairFlat(const Sequence *Seq1, const Sequence *Seq2, string &Path) { float EA = AlignPairFlat_SparsePost(Seq1, Seq2, Path, 0); return EA; } muscle-5.1.0/src/allocflat.cpp000066400000000000000000000021501424453062600162450ustar00rootroot00000000000000#include "muscle.h" /* Element count of a forward/backward buffer: (LX+1)*(LY+1)*HMMSTATE_COUNT. Dies when the count exceeds 4e9. */ uint64 GetFBSize(uint LX, uint LY) { uint64 Size64 = uint64(LX + 1)*uint64(LY + 1)*HMMSTATE_COUNT; if (double(Size64) > 4e9) Die("Memory object too large due to sequence 
lengths %u, %u", LX, LY); uint Size = uint(Size64); /* Check that narrowing to uint lost nothing, in the same form the sibling Get*Size functions use; the previous test compared the narrowed value with itself. */ asserta(uint64(Size) == Size64); return Size; } /* Element count of a posterior matrix: LX*LY; asserts that the product fits in uint. */ uint64 GetPostSize(uint LX, uint LY) { uint64 Size64 = uint64(LX)*uint64(LY); uint Size = uint(Size64); asserta(uint64(Size) == Size64); return Size; } /* Element count of the DP row buffer: 2*(LY+1); LX is not used. */ uint64 GetDPRowsSize(uint LX, uint LY) { uint64 Size64 = 2*uint64(LY + 1); uint Size = uint(Size64); asserta(uint64(Size) == Size64); return Size; } /* Element count of the traceback matrix: (LX+1)*(LY+1); asserts that the product fits in uint. */ uint64 GetTBSize(uint LX, uint LY) { uint64 Size64 = uint64(LX + 1)*uint64(LY + 1); uint Size = uint(Size64); asserta(uint64(Size) == Size64); return Size; } /* The four allocators below obtain their buffers with myalloc64, sized by the matching Get*Size function. */ float *AllocFB(uint LX, uint LY) { return myalloc64(float, GetFBSize(LX, LY)); } float *AllocPost(uint LX, uint LY) { return myalloc64(float, GetPostSize(LX, LY)); } float *AllocDPRows(uint LX, uint LY) { return myalloc64(float, GetDPRowsSize(LX, LY)); } char *AllocTB(uint LX, uint LY) { return myalloc64(char, GetTBSize(LX, LY)); } muscle-5.1.0/src/alnalnsflat.cpp000066400000000000000000000023151424453062600166060ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" float CalcAlnFlat(const float *Post, uint LX, uint LY, float *DPRows, char *TB, string &Path); /* Aligns two alignments: builds a posterior over their columns (BuildPost), computes a path with CalcAlnFlat, and returns a newly allocated MultiSequence whose rows are the rows of MSA1 (AddGapsPath with 'X') followed by the rows of MSA2 (AddGapsPath with 'Y'). */ MultiSequence *MPCFlat::AlignAlns(const MultiSequence &MSA1, const MultiSequence &MSA2) { const uint SeqCount1 = MSA1.GetSeqCount(); const uint SeqCount2 = MSA2.GetSeqCount(); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); float *Post = AllocPost(ColCount1, ColCount2); BuildPost(MSA1, MSA2, Post); float *DPRows = AllocDPRows(ColCount1, ColCount2); char *TB = AllocTB(ColCount1, ColCount2); string Path; CalcAlnFlat(Post, ColCount1, ColCount2, DPRows, TB, Path); myfree(Post); myfree(DPRows); myfree(TB); MultiSequence *result = new MultiSequence(); for (uint SeqIndex1 = 0; SeqIndex1 < SeqCount1; ++SeqIndex1) { const Sequence *InputRow = MSA1.GetSequence(SeqIndex1); Sequence *AlignedRow = InputRow->AddGapsPath(Path, 'X'); result->AddSequence(AlignedRow, true); } for (uint SeqIndex2 = 0; SeqIndex2 < SeqCount2; 
++SeqIndex2) { const Sequence *InputRow = MSA2.GetSequence(SeqIndex2); Sequence *AlignedRow = InputRow->AddGapsPath(Path, 'Y'); result->AddSequence(AlignedRow, true); } return result; } muscle-5.1.0/src/alnmsasflat.cpp000066400000000000000000000023771424453062600166240ustar00rootroot00000000000000#include "muscle.h" #include "locallock.h" /* Aligns two non-empty, already aligned MSAs. GetPairs selects sequence pairs (TargetPairCount is passed through to it), GetPostPairsAlignedFlat produces one sparse matrix per pair and the value returned here, CalcPosteriorFlat3 combines them into Post, and CalcAlnFlat writes the column path into Path. NOTE(review): the element types vector<uint> and vector<MySparseMx *> are restored from usage (index lists; elements released with delete) - confirm against muscle.h. */ float AlignMSAsFlat(const string &ProgressStr, const MultiSequence &MSA1, const MultiSequence &MSA2, uint TargetPairCount, string &Path) { const uint SeqCount1 = MSA1.GetNumSequences(); const uint SeqCount2 = MSA2.GetNumSequences(); asserta(SeqCount1 > 0); asserta(SeqCount2 > 0); asserta(MSA1.IsAligned()); asserta(MSA2.IsAligned()); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); vector<uint> SeqIndexes1; vector<uint> SeqIndexes2; GetPairs(SeqCount1, SeqCount2, TargetPairCount, SeqIndexes1, SeqIndexes2); const uint PairCount = SIZE(SeqIndexes1); asserta(SIZE(SeqIndexes2) == PairCount); vector<MySparseMx *> SparseMxs; float AvgEA = GetPostPairsAlignedFlat(ProgressStr, MSA1, MSA2, SeqIndexes1, SeqIndexes2, SparseMxs); const uint L1 = ColCount1; const uint L2 = ColCount2; float *Post = AllocPost(L1, L2); CalcPosteriorFlat3(MSA1, MSA2, SeqIndexes1, SeqIndexes2, SparseMxs, Post); for (uint i = 0; i < PairCount; ++i) delete SparseMxs[i]; SparseMxs.clear(); float *DPRows = AllocDPRows(L1, L2); char *TB = AllocTB(L1, L2); CalcAlnFlat(Post, ColCount1, ColCount2, DPRows, TB, Path); /* Post, DPRows and TB come from the Alloc* helpers (myalloc64), so release them with myfree as MPCFlat::AlignAlns does, not with operator delete. */ myfree(Post); myfree(DPRows); myfree(TB); return AvgEA; } muscle-5.1.0/src/alnmsasflat3.cpp000066400000000000000000000024601424453062600167000ustar00rootroot00000000000000#include "muscle.h" /* Variant of AlignMSAsFlat with three extra parameters (SparseMxVec, Index1, Index2). NOTE(review): the element type of SparseMxVec is restored as MySparseMx * by analogy with the local SparseMxs vector - confirm against the declaration in muscle.h. */ float AlignMSAsFlat3(const string &ProgressStr, const MultiSequence &MSA1, const MultiSequence &MSA2, const vector<MySparseMx *> &SparseMxVec, uint Index1, uint Index2, uint TargetPairCount, string &Path) { const uint SeqCount1 = MSA1.GetNumSequences(); const uint SeqCount2 = MSA2.GetNumSequences(); asserta(SeqCount1 > 0); asserta(SeqCount2 > 0); asserta(MSA1.IsAligned()); 
asserta(MSA2.IsAligned()); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); /* NOTE(review): element types restored from usage (index lists; sparse matrices released with delete) - confirm against muscle.h. */ vector<uint> SeqIndexes1; vector<uint> SeqIndexes2; GetPairs(SeqCount1, SeqCount2, TargetPairCount, SeqIndexes1, SeqIndexes2); const uint PairCount = SIZE(SeqIndexes1); asserta(SIZE(SeqIndexes2) == PairCount); vector<MySparseMx *> SparseMxs; float AvgEA = GetPostPairsAlignedFlat(ProgressStr, MSA1, MSA2, SeqIndexes1, SeqIndexes2, SparseMxs); const uint L1 = ColCount1; const uint L2 = ColCount2; float *Post = AllocPost(L1, L2); CalcPosteriorFlat3(MSA1, MSA2, SeqIndexes1, SeqIndexes2, SparseMxs, Post); for (uint i = 0; i < PairCount; ++i) delete SparseMxs[i]; SparseMxs.clear(); float *DPRows = AllocDPRows(L1, L2); char *TB = AllocTB(L1, L2); CalcAlnFlat(Post, ColCount1, ColCount2, DPRows, TB, Path); /* Post, DPRows and TB come from the Alloc* helpers (myalloc64), so release them with myfree as MPCFlat::AlignAlns does, not with operator delete. */ myfree(Post); myfree(DPRows); myfree(TB); return AvgEA; } muscle-5.1.0/src/alpha.cpp000066400000000000000000000132321424453062600153740ustar00rootroot00000000000000#include "muscle.h" /*** From Bioperl docs: Extended DNA / RNA alphabet ------------------------------------------ Symbol Meaning Nucleic Acid ------------------------------------------ A A Adenine C C Cytosine G G Guanine T T Thymine U U Uracil M A or C R A or G W A or T S C or G Y C or T K G or T V A or C or G H A or C or T D A or G or T B C or G or T X G or A or T or C N G or A or T or C IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. 
***/ unsigned g_CharToLetter[MAX_CHAR]; unsigned g_CharToLetterEx[MAX_CHAR]; char g_LetterToChar[MAX_ALPHA]; char g_LetterExToChar[MAX_ALPHA_EX]; char g_UnalignChar[MAX_CHAR]; char g_AlignChar[MAX_CHAR]; bool g_IsWildcardChar[MAX_CHAR]; bool g_IsResidueChar[MAX_CHAR]; ALPHA g_Alpha = ALPHA_Undefined; unsigned g_AlphaSize = 0; #define Res(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetter[Upper] = Letter; \ g_CharToLetter[Lower] = Letter; \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ g_LetterToChar[Letter] = Upper; \ g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ } #define Wild(c, Letter) \ { \ const unsigned char Upper = (unsigned char) toupper(c); \ const unsigned char Lower = (unsigned char) tolower(c); \ g_CharToLetterEx[Upper] = Letter; \ g_CharToLetterEx[Lower] = Letter; \ g_LetterExToChar[Letter] = Upper; \ g_IsResidueChar[Upper] = true; \ g_IsResidueChar[Lower] = true; \ g_AlignChar[Upper] = Upper; \ g_AlignChar[Lower] = Upper; \ g_UnalignChar[Upper] = Lower; \ g_UnalignChar[Lower] = Lower; \ g_IsWildcardChar[Lower] = true; \ g_IsWildcardChar[Upper] = true; \ } /* Number of letters in the alphabet: 20 for amino acids, 4 for nucleotides; any other value dies. */ static unsigned GetAlphaSize(ALPHA Alpha) { switch (Alpha) { case ALPHA_Amino: return 20; case ALPHA_Nucleo: return 4; } Die("Invalid Alpha=%d", Alpha); return 0; } /* Resets every lookup table to its "unset" state before an alphabet is installed: letter maps to 0xff bytes, character maps to '?', and both flag tables to false. g_IsResidueChar is cleared as well so that flags set by a previously installed alphabet do not survive a later SetAlpha call, and each memset is sized by the array it fills. */ static void InitArrays() { memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter)); memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx)); memset(g_LetterToChar, '?', sizeof(g_LetterToChar)); memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar)); memset(g_AlignChar, '?', sizeof(g_AlignChar)); memset(g_UnalignChar, '?', sizeof(g_UnalignChar)); memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar)); memset(g_IsResidueChar, 0, sizeof(g_IsResidueChar)); } static void SetGapChar(char c) { 
unsigned char u = (unsigned char) c; g_CharToLetterEx[u] = AX_GAP; g_LetterExToChar[AX_GAP] = u; g_AlignChar[u] = u; g_UnalignChar[u] = u; } static void SetAlphaNucleo() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('T', NX_T) Res('U', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaDNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaRNA() { Res('A', NX_A) Res('C', NX_C) Res('G', NX_G) Res('U', NX_U) Res('T', NX_T) Wild('M', NX_M) Wild('R', NX_R) Wild('W', NX_W) Wild('S', NX_S) Wild('Y', NX_Y) Wild('K', NX_K) Wild('V', NX_V) Wild('H', NX_H) Wild('D', NX_D) Wild('B', NX_B) Wild('X', NX_X) Wild('N', NX_N) } static void SetAlphaAmino() { Res('A', AX_A) Res('C', AX_C) Res('D', AX_D) Res('E', AX_E) Res('F', AX_F) Res('G', AX_G) Res('H', AX_H) Res('I', AX_I) Res('K', AX_K) Res('L', AX_L) Res('M', AX_M) Res('N', AX_N) Res('P', AX_P) Res('Q', AX_Q) Res('R', AX_R) Res('S', AX_S) Res('T', AX_T) Res('V', AX_V) Res('W', AX_W) Res('Y', AX_Y) Wild('B', AX_B) Wild('X', AX_X) Wild('Z', AX_Z) } void SetAlpha(ALPHA Alpha) { InitArrays(); SetGapChar('.'); SetGapChar('-'); switch (Alpha) { case ALPHA_Amino: SetAlphaAmino(); break; case ALPHA_Nucleo: SetAlphaNucleo(); break; default: Die("Invalid Alpha=%d", Alpha); } g_AlphaSize = GetAlphaSize(Alpha); g_Alpha = Alpha; } char GetWildcardChar() { switch (g_Alpha) { case ALPHA_Amino: return 'X'; case ALPHA_Nucleo: return 'N'; default: Die("Invalid Alpha=%d", g_Alpha); } return '?'; } bool IsNucleo(char c) { return strchr("ACGTURYNacgturyn", c) != 0; } bool IsDNA(char c) { return strchr("AGCTNagctn", c) != 0; } bool IsRNA(char c) { return 
strchr("AGCUNagcun", c) != 0; } static char InvalidLetters[256]; static int InvalidLetterCount = 0; void ClearInvalidLetterWarning() { memset(InvalidLetters, 0, 256); } void InvalidLetterWarning(char c, char w) { InvalidLetters[(unsigned char) c] = 1; ++InvalidLetterCount; } void ReportInvalidLetters() { if (0 == InvalidLetterCount) return; char Str[257]; memset(Str, 0, 257); int n = 0; for (int i = 0; i < 256; ++i) { if (InvalidLetters[i]) Str[n++] = (char) i; } Warning("Invalid letters found: %s", Str); } muscle-5.1.0/src/alpha.h000066400000000000000000000041331424453062600150410ustar00rootroot00000000000000#ifndef alpha_h #define alpha_h enum ALPHA { ALPHA_Undefined, ALPHA_Nucleo, ALPHA_Amino }; bool StrHasAmino(const char *Str); bool StrHasGap(const char *Str); void ClearInvalidLetterWarning(); void InvalidLetterWarning(char c, char w); void ReportInvalidLetters(); extern unsigned g_CharToLetter[]; extern unsigned g_CharToLetterEx[]; extern char g_LetterToChar[]; extern char g_LetterExToChar[]; extern char g_UnalignChar[]; extern char g_AlignChar[]; extern bool g_IsWildcardChar[]; extern bool g_IsResidueChar[]; #define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)]) #define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)]) #define LetterToChar(u) (g_LetterToChar[u]) #define LetterExToChar(u) (g_LetterExToChar[u]) #define IsResidueChar(c) (g_IsResidueChar[(unsigned char) (c)]) #define IsGapChar(c) ('-' == (c) || '.' 
== (c)) #define IsWildcardChar(c) (g_IsWildcardChar[(unsigned char) (c)]) #define AlignChar(c) (g_AlignChar[(unsigned char) (c)]) #define UnalignChar(c) (g_UnalignChar[(unsigned char) (c)]) // AX=Amino alphabet with eXtensions (B, Z and X) enum AX { AX_A, AX_C, AX_D, AX_E, AX_F, AX_G, AX_H, AX_I, AX_K, AX_L, AX_M, AX_N, AX_P, AX_Q, AX_R, AX_S, AX_T, AX_V, AX_W, AX_Y, AX_X, // Any AX_B, // D or N AX_Z, // E or Q AX_GAP, }; const unsigned AX_COUNT = AX_GAP + 1; // NX=Nucleotide alphabet with extensions enum NX { NX_A, NX_C, NX_G, NX_T, NX_U = NX_T, NX_M, // AC NX_R, // AG NX_W, // AT NX_S, // CG NX_Y, // CT NX_K, // GT NX_V, // ACG NX_H, // ACT NX_D, // AGT NX_B, // CGT NX_X, // GATC NX_N, // GATC NX_GAP }; const unsigned NX_COUNT = NX_GAP + 1; const unsigned MAX_ALPHA = 20; const unsigned MAX_ALPHA_EX = AX_COUNT; const unsigned MAX_CHAR = 256; extern ALPHA g_Alpha; extern unsigned g_AlphaSize; void SetAlpha(ALPHA Alpha); char GetWildcardChar(); bool IsNucleo(char c); bool IsDNA(char c); bool IsRNA(char c); static inline bool isgap(char c) { return c == '-' || c == '.'; } extern byte g_CharToLetterNucleo[256]; extern byte g_CharToLetterAmino[256]; #endif // alpha_h muscle-5.1.0/src/alpha3.cpp000066400000000000000000004032341424453062600154640ustar00rootroot00000000000000// Generated by /e/r/py/alphac.py #include "myutils.h" #include "alpha3.h" byte g_AminoAcidChars[20] = { 'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y', }; byte g_CharToLetterAminoStop[256] = { INVALID_LETTER, // [ 0] 0x00 INVALID_LETTER, // [ 1] 0x01 INVALID_LETTER, // [ 2] 0x02 INVALID_LETTER, // [ 3] 0x03 INVALID_LETTER, // [ 4] 0x04 INVALID_LETTER, // [ 5] 0x05 INVALID_LETTER, // [ 6] 0x06 INVALID_LETTER, // [ 7] 0x07 INVALID_LETTER, // [ 8] 0x08 INVALID_LETTER, // [ 9] 0x09 INVALID_LETTER, // [ 10] 0x0a INVALID_LETTER, // [ 11] 0x0b INVALID_LETTER, // [ 12] 0x0c INVALID_LETTER, // [ 13] 0x0d INVALID_LETTER, // [ 14] 0x0e INVALID_LETTER, // [ 15] 0x0f 
INVALID_LETTER, // [ 16] 0x10 INVALID_LETTER, // [ 17] 0x11 INVALID_LETTER, // [ 18] 0x12 INVALID_LETTER, // [ 19] 0x13 INVALID_LETTER, // [ 20] 0x14 INVALID_LETTER, // [ 21] 0x15 INVALID_LETTER, // [ 22] 0x16 INVALID_LETTER, // [ 23] 0x17 INVALID_LETTER, // [ 24] 0x18 INVALID_LETTER, // [ 25] 0x19 INVALID_LETTER, // [ 26] 0x1a INVALID_LETTER, // [ 27] 0x1b INVALID_LETTER, // [ 28] 0x1c INVALID_LETTER, // [ 29] 0x1d INVALID_LETTER, // [ 30] 0x1e INVALID_LETTER, // [ 31] 0x1f INVALID_LETTER, // [ 32] ' ' INVALID_LETTER, // [ 33] '!' INVALID_LETTER, // [ 34] '"' INVALID_LETTER, // [ 35] '#' INVALID_LETTER, // [ 36] '$' INVALID_LETTER, // [ 37] '%' INVALID_LETTER, // [ 38] '&' INVALID_LETTER, // [ 39] ''' INVALID_LETTER, // [ 40] '(' INVALID_LETTER, // [ 41] ')' 20 , // [ 42] '*' = STP INVALID_LETTER, // [ 43] '+' INVALID_LETTER, // [ 44] ',' INVALID_LETTER, // [ 45] '-' INVALID_LETTER, // [ 46] '.' INVALID_LETTER, // [ 47] '/' INVALID_LETTER, // [ 48] '0' INVALID_LETTER, // [ 49] '1' INVALID_LETTER, // [ 50] '2' INVALID_LETTER, // [ 51] '3' INVALID_LETTER, // [ 52] '4' INVALID_LETTER, // [ 53] '5' INVALID_LETTER, // [ 54] '6' INVALID_LETTER, // [ 55] '7' INVALID_LETTER, // [ 56] '8' INVALID_LETTER, // [ 57] '9' INVALID_LETTER, // [ 58] ':' INVALID_LETTER, // [ 59] ';' INVALID_LETTER, // [ 60] '<' INVALID_LETTER, // [ 61] '=' INVALID_LETTER, // [ 62] '>' INVALID_LETTER, // [ 63] '?' 
INVALID_LETTER, // [ 64] '@' 0 , // [ 65] 'A' = Ala INVALID_LETTER, // [ 66] 'B' 1 , // [ 67] 'C' = Cys 2 , // [ 68] 'D' = Asp 3 , // [ 69] 'E' = Glu 4 , // [ 70] 'F' = Phe 5 , // [ 71] 'G' = Gly 6 , // [ 72] 'H' = His 7 , // [ 73] 'I' = Ile INVALID_LETTER, // [ 74] 'J' 8 , // [ 75] 'K' = Lys 9 , // [ 76] 'L' = Leu 10 , // [ 77] 'M' = Met 11 , // [ 78] 'N' = Asn INVALID_LETTER, // [ 79] 'O' 12 , // [ 80] 'P' = Pro 13 , // [ 81] 'Q' = Gln 14 , // [ 82] 'R' = Arg 15 , // [ 83] 'S' = Ser 16 , // [ 84] 'T' = Thr INVALID_LETTER, // [ 85] 'U' 17 , // [ 86] 'V' = Val 18 , // [ 87] 'W' = Trp INVALID_LETTER, // [ 88] 'X' 19 , // [ 89] 'Y' = Tyr INVALID_LETTER, // [ 90] 'Z' INVALID_LETTER, // [ 91] '[' INVALID_LETTER, // [ 92] '\' INVALID_LETTER, // [ 93] ']' INVALID_LETTER, // [ 94] '^' INVALID_LETTER, // [ 95] '_' INVALID_LETTER, // [ 96] '`' 0 , // [ 97] 'a' = Ala INVALID_LETTER, // [ 98] 'b' 1 , // [ 99] 'c' = Cys 2 , // [100] 'd' = Asp 3 , // [101] 'e' = Glu 4 , // [102] 'f' = Phe 5 , // [103] 'g' = Gly 6 , // [104] 'h' = His 7 , // [105] 'i' = Ile INVALID_LETTER, // [106] 'j' 8 , // [107] 'k' = Lys 9 , // [108] 'l' = Leu 10 , // [109] 'm' = Met 11 , // [110] 'n' = Asn INVALID_LETTER, // [111] 'o' 12 , // [112] 'p' = Pro 13 , // [113] 'q' = Gln 14 , // [114] 'r' = Arg 15 , // [115] 's' = Ser 16 , // [116] 't' = Thr INVALID_LETTER, // [117] 'u' 17 , // [118] 'v' = Val 18 , // [119] 'w' = Trp INVALID_LETTER, // [120] 'x' 19 , // [121] 'y' = Tyr INVALID_LETTER, // [122] 'z' INVALID_LETTER, // [123] '{' INVALID_LETTER, // [124] '|' INVALID_LETTER, // [125] '}' INVALID_LETTER, // [126] '~' INVALID_LETTER, // [127] 0x7f INVALID_LETTER, // [128] 0x80 INVALID_LETTER, // [129] 0x81 INVALID_LETTER, // [130] 0x82 INVALID_LETTER, // [131] 0x83 INVALID_LETTER, // [132] 0x84 INVALID_LETTER, // [133] 0x85 INVALID_LETTER, // [134] 0x86 INVALID_LETTER, // [135] 0x87 INVALID_LETTER, // [136] 0x88 INVALID_LETTER, // [137] 0x89 INVALID_LETTER, // [138] 0x8a INVALID_LETTER, // [139] 0x8b 
INVALID_LETTER, // [140] 0x8c INVALID_LETTER, // [141] 0x8d INVALID_LETTER, // [142] 0x8e INVALID_LETTER, // [143] 0x8f INVALID_LETTER, // [144] 0x90 INVALID_LETTER, // [145] 0x91 INVALID_LETTER, // [146] 0x92 INVALID_LETTER, // [147] 0x93 INVALID_LETTER, // [148] 0x94 INVALID_LETTER, // [149] 0x95 INVALID_LETTER, // [150] 0x96 INVALID_LETTER, // [151] 0x97 INVALID_LETTER, // [152] 0x98 INVALID_LETTER, // [153] 0x99 INVALID_LETTER, // [154] 0x9a INVALID_LETTER, // [155] 0x9b INVALID_LETTER, // [156] 0x9c INVALID_LETTER, // [157] 0x9d INVALID_LETTER, // [158] 0x9e INVALID_LETTER, // [159] 0x9f INVALID_LETTER, // [160] 0xa0 INVALID_LETTER, // [161] 0xa1 INVALID_LETTER, // [162] 0xa2 INVALID_LETTER, // [163] 0xa3 INVALID_LETTER, // [164] 0xa4 INVALID_LETTER, // [165] 0xa5 INVALID_LETTER, // [166] 0xa6 INVALID_LETTER, // [167] 0xa7 INVALID_LETTER, // [168] 0xa8 INVALID_LETTER, // [169] 0xa9 INVALID_LETTER, // [170] 0xaa INVALID_LETTER, // [171] 0xab INVALID_LETTER, // [172] 0xac INVALID_LETTER, // [173] 0xad INVALID_LETTER, // [174] 0xae INVALID_LETTER, // [175] 0xaf INVALID_LETTER, // [176] 0xb0 INVALID_LETTER, // [177] 0xb1 INVALID_LETTER, // [178] 0xb2 INVALID_LETTER, // [179] 0xb3 INVALID_LETTER, // [180] 0xb4 INVALID_LETTER, // [181] 0xb5 INVALID_LETTER, // [182] 0xb6 INVALID_LETTER, // [183] 0xb7 INVALID_LETTER, // [184] 0xb8 INVALID_LETTER, // [185] 0xb9 INVALID_LETTER, // [186] 0xba INVALID_LETTER, // [187] 0xbb INVALID_LETTER, // [188] 0xbc INVALID_LETTER, // [189] 0xbd INVALID_LETTER, // [190] 0xbe INVALID_LETTER, // [191] 0xbf INVALID_LETTER, // [192] 0xc0 INVALID_LETTER, // [193] 0xc1 INVALID_LETTER, // [194] 0xc2 INVALID_LETTER, // [195] 0xc3 INVALID_LETTER, // [196] 0xc4 INVALID_LETTER, // [197] 0xc5 INVALID_LETTER, // [198] 0xc6 INVALID_LETTER, // [199] 0xc7 INVALID_LETTER, // [200] 0xc8 INVALID_LETTER, // [201] 0xc9 INVALID_LETTER, // [202] 0xca INVALID_LETTER, // [203] 0xcb INVALID_LETTER, // [204] 0xcc INVALID_LETTER, // [205] 0xcd INVALID_LETTER, // 
[206] 0xce INVALID_LETTER, // [207] 0xcf INVALID_LETTER, // [208] 0xd0 INVALID_LETTER, // [209] 0xd1 INVALID_LETTER, // [210] 0xd2 INVALID_LETTER, // [211] 0xd3 INVALID_LETTER, // [212] 0xd4 INVALID_LETTER, // [213] 0xd5 INVALID_LETTER, // [214] 0xd6 INVALID_LETTER, // [215] 0xd7 INVALID_LETTER, // [216] 0xd8 INVALID_LETTER, // [217] 0xd9 INVALID_LETTER, // [218] 0xda INVALID_LETTER, // [219] 0xdb INVALID_LETTER, // [220] 0xdc INVALID_LETTER, // [221] 0xdd INVALID_LETTER, // [222] 0xde INVALID_LETTER, // [223] 0xdf INVALID_LETTER, // [224] 0xe0 INVALID_LETTER, // [225] 0xe1 INVALID_LETTER, // [226] 0xe2 INVALID_LETTER, // [227] 0xe3 INVALID_LETTER, // [228] 0xe4 INVALID_LETTER, // [229] 0xe5 INVALID_LETTER, // [230] 0xe6 INVALID_LETTER, // [231] 0xe7 INVALID_LETTER, // [232] 0xe8 INVALID_LETTER, // [233] 0xe9 INVALID_LETTER, // [234] 0xea INVALID_LETTER, // [235] 0xeb INVALID_LETTER, // [236] 0xec INVALID_LETTER, // [237] 0xed INVALID_LETTER, // [238] 0xee INVALID_LETTER, // [239] 0xef INVALID_LETTER, // [240] 0xf0 INVALID_LETTER, // [241] 0xf1 INVALID_LETTER, // [242] 0xf2 INVALID_LETTER, // [243] 0xf3 INVALID_LETTER, // [244] 0xf4 INVALID_LETTER, // [245] 0xf5 INVALID_LETTER, // [246] 0xf6 INVALID_LETTER, // [247] 0xf7 INVALID_LETTER, // [248] 0xf8 INVALID_LETTER, // [249] 0xf9 INVALID_LETTER, // [250] 0xfa INVALID_LETTER, // [251] 0xfb INVALID_LETTER, // [252] 0xfc INVALID_LETTER, // [253] 0xfd INVALID_LETTER, // [254] 0xfe INVALID_LETTER, // [255] 0xff }; byte g_CharToLetterAminoGap[256] = { INVALID_LETTER, // [ 0] 0x00 INVALID_LETTER, // [ 1] 0x01 INVALID_LETTER, // [ 2] 0x02 INVALID_LETTER, // [ 3] 0x03 INVALID_LETTER, // [ 4] 0x04 INVALID_LETTER, // [ 5] 0x05 INVALID_LETTER, // [ 6] 0x06 INVALID_LETTER, // [ 7] 0x07 INVALID_LETTER, // [ 8] 0x08 INVALID_LETTER, // [ 9] 0x09 INVALID_LETTER, // [ 10] 0x0a INVALID_LETTER, // [ 11] 0x0b INVALID_LETTER, // [ 12] 0x0c INVALID_LETTER, // [ 13] 0x0d INVALID_LETTER, // [ 14] 0x0e INVALID_LETTER, // [ 15] 0x0f 
INVALID_LETTER, // [ 16] 0x10 INVALID_LETTER, // [ 17] 0x11 INVALID_LETTER, // [ 18] 0x12 INVALID_LETTER, // [ 19] 0x13 INVALID_LETTER, // [ 20] 0x14 INVALID_LETTER, // [ 21] 0x15 INVALID_LETTER, // [ 22] 0x16 INVALID_LETTER, // [ 23] 0x17 INVALID_LETTER, // [ 24] 0x18 INVALID_LETTER, // [ 25] 0x19 INVALID_LETTER, // [ 26] 0x1a INVALID_LETTER, // [ 27] 0x1b INVALID_LETTER, // [ 28] 0x1c INVALID_LETTER, // [ 29] 0x1d INVALID_LETTER, // [ 30] 0x1e INVALID_LETTER, // [ 31] 0x1f INVALID_LETTER, // [ 32] ' ' INVALID_LETTER, // [ 33] '!' INVALID_LETTER, // [ 34] '"' INVALID_LETTER, // [ 35] '#' INVALID_LETTER, // [ 36] '$' INVALID_LETTER, // [ 37] '%' INVALID_LETTER, // [ 38] '&' INVALID_LETTER, // [ 39] ''' INVALID_LETTER, // [ 40] '(' INVALID_LETTER, // [ 41] ')' 20 , // [ 42] '*' = STP INVALID_LETTER, // [ 43] '+' INVALID_LETTER, // [ 44] ',' 20 , // [ 45] '-' gap INVALID_LETTER, // [ 46] '.' INVALID_LETTER, // [ 47] '/' INVALID_LETTER, // [ 48] '0' INVALID_LETTER, // [ 49] '1' INVALID_LETTER, // [ 50] '2' INVALID_LETTER, // [ 51] '3' INVALID_LETTER, // [ 52] '4' INVALID_LETTER, // [ 53] '5' INVALID_LETTER, // [ 54] '6' INVALID_LETTER, // [ 55] '7' INVALID_LETTER, // [ 56] '8' INVALID_LETTER, // [ 57] '9' INVALID_LETTER, // [ 58] ':' INVALID_LETTER, // [ 59] ';' INVALID_LETTER, // [ 60] '<' INVALID_LETTER, // [ 61] '=' INVALID_LETTER, // [ 62] '>' INVALID_LETTER, // [ 63] '?' 
INVALID_LETTER, // [ 64] '@' 0 , // [ 65] 'A' = Ala INVALID_LETTER, // [ 66] 'B' 1 , // [ 67] 'C' = Cys 2 , // [ 68] 'D' = Asp 3 , // [ 69] 'E' = Glu 4 , // [ 70] 'F' = Phe 5 , // [ 71] 'G' = Gly 6 , // [ 72] 'H' = His 7 , // [ 73] 'I' = Ile INVALID_LETTER, // [ 74] 'J' 8 , // [ 75] 'K' = Lys 9 , // [ 76] 'L' = Leu 10 , // [ 77] 'M' = Met 11 , // [ 78] 'N' = Asn INVALID_LETTER, // [ 79] 'O' 12 , // [ 80] 'P' = Pro 13 , // [ 81] 'Q' = Gln 14 , // [ 82] 'R' = Arg 15 , // [ 83] 'S' = Ser 16 , // [ 84] 'T' = Thr INVALID_LETTER, // [ 85] 'U' 17 , // [ 86] 'V' = Val 18 , // [ 87] 'W' = Trp INVALID_LETTER, // [ 88] 'X' 19 , // [ 89] 'Y' = Tyr INVALID_LETTER, // [ 90] 'Z' INVALID_LETTER, // [ 91] '[' INVALID_LETTER, // [ 92] '\' INVALID_LETTER, // [ 93] ']' INVALID_LETTER, // [ 94] '^' INVALID_LETTER, // [ 95] '_' INVALID_LETTER, // [ 96] '`' 0 , // [ 97] 'a' = Ala INVALID_LETTER, // [ 98] 'b' 1 , // [ 99] 'c' = Cys 2 , // [100] 'd' = Asp 3 , // [101] 'e' = Glu 4 , // [102] 'f' = Phe 5 , // [103] 'g' = Gly 6 , // [104] 'h' = His 7 , // [105] 'i' = Ile INVALID_LETTER, // [106] 'j' 8 , // [107] 'k' = Lys 9 , // [108] 'l' = Leu 10 , // [109] 'm' = Met 11 , // [110] 'n' = Asn INVALID_LETTER, // [111] 'o' 12 , // [112] 'p' = Pro 13 , // [113] 'q' = Gln 14 , // [114] 'r' = Arg 15 , // [115] 's' = Ser 16 , // [116] 't' = Thr INVALID_LETTER, // [117] 'u' 17 , // [118] 'v' = Val 18 , // [119] 'w' = Trp INVALID_LETTER, // [120] 'x' 19 , // [121] 'y' = Tyr INVALID_LETTER, // [122] 'z' INVALID_LETTER, // [123] '{' INVALID_LETTER, // [124] '|' INVALID_LETTER, // [125] '}' INVALID_LETTER, // [126] '~' INVALID_LETTER, // [127] 0x7f INVALID_LETTER, // [128] 0x80 INVALID_LETTER, // [129] 0x81 INVALID_LETTER, // [130] 0x82 INVALID_LETTER, // [131] 0x83 INVALID_LETTER, // [132] 0x84 INVALID_LETTER, // [133] 0x85 INVALID_LETTER, // [134] 0x86 INVALID_LETTER, // [135] 0x87 INVALID_LETTER, // [136] 0x88 INVALID_LETTER, // [137] 0x89 INVALID_LETTER, // [138] 0x8a INVALID_LETTER, // [139] 0x8b 
INVALID_LETTER, // [140] 0x8c INVALID_LETTER, // [141] 0x8d INVALID_LETTER, // [142] 0x8e INVALID_LETTER, // [143] 0x8f INVALID_LETTER, // [144] 0x90 INVALID_LETTER, // [145] 0x91 INVALID_LETTER, // [146] 0x92 INVALID_LETTER, // [147] 0x93 INVALID_LETTER, // [148] 0x94 INVALID_LETTER, // [149] 0x95 INVALID_LETTER, // [150] 0x96 INVALID_LETTER, // [151] 0x97 INVALID_LETTER, // [152] 0x98 INVALID_LETTER, // [153] 0x99 INVALID_LETTER, // [154] 0x9a INVALID_LETTER, // [155] 0x9b INVALID_LETTER, // [156] 0x9c INVALID_LETTER, // [157] 0x9d INVALID_LETTER, // [158] 0x9e INVALID_LETTER, // [159] 0x9f INVALID_LETTER, // [160] 0xa0 INVALID_LETTER, // [161] 0xa1 INVALID_LETTER, // [162] 0xa2 INVALID_LETTER, // [163] 0xa3 INVALID_LETTER, // [164] 0xa4 INVALID_LETTER, // [165] 0xa5 INVALID_LETTER, // [166] 0xa6 INVALID_LETTER, // [167] 0xa7 INVALID_LETTER, // [168] 0xa8 INVALID_LETTER, // [169] 0xa9 INVALID_LETTER, // [170] 0xaa INVALID_LETTER, // [171] 0xab INVALID_LETTER, // [172] 0xac INVALID_LETTER, // [173] 0xad INVALID_LETTER, // [174] 0xae INVALID_LETTER, // [175] 0xaf INVALID_LETTER, // [176] 0xb0 INVALID_LETTER, // [177] 0xb1 INVALID_LETTER, // [178] 0xb2 INVALID_LETTER, // [179] 0xb3 INVALID_LETTER, // [180] 0xb4 INVALID_LETTER, // [181] 0xb5 INVALID_LETTER, // [182] 0xb6 INVALID_LETTER, // [183] 0xb7 INVALID_LETTER, // [184] 0xb8 INVALID_LETTER, // [185] 0xb9 INVALID_LETTER, // [186] 0xba INVALID_LETTER, // [187] 0xbb INVALID_LETTER, // [188] 0xbc INVALID_LETTER, // [189] 0xbd INVALID_LETTER, // [190] 0xbe INVALID_LETTER, // [191] 0xbf INVALID_LETTER, // [192] 0xc0 INVALID_LETTER, // [193] 0xc1 INVALID_LETTER, // [194] 0xc2 INVALID_LETTER, // [195] 0xc3 INVALID_LETTER, // [196] 0xc4 INVALID_LETTER, // [197] 0xc5 INVALID_LETTER, // [198] 0xc6 INVALID_LETTER, // [199] 0xc7 INVALID_LETTER, // [200] 0xc8 INVALID_LETTER, // [201] 0xc9 INVALID_LETTER, // [202] 0xca INVALID_LETTER, // [203] 0xcb INVALID_LETTER, // [204] 0xcc INVALID_LETTER, // [205] 0xcd INVALID_LETTER, // 
[206] 0xce INVALID_LETTER, // [207] 0xcf INVALID_LETTER, // [208] 0xd0 INVALID_LETTER, // [209] 0xd1 INVALID_LETTER, // [210] 0xd2 INVALID_LETTER, // [211] 0xd3 INVALID_LETTER, // [212] 0xd4 INVALID_LETTER, // [213] 0xd5 INVALID_LETTER, // [214] 0xd6 INVALID_LETTER, // [215] 0xd7 INVALID_LETTER, // [216] 0xd8 INVALID_LETTER, // [217] 0xd9 INVALID_LETTER, // [218] 0xda INVALID_LETTER, // [219] 0xdb INVALID_LETTER, // [220] 0xdc INVALID_LETTER, // [221] 0xdd INVALID_LETTER, // [222] 0xde INVALID_LETTER, // [223] 0xdf INVALID_LETTER, // [224] 0xe0 INVALID_LETTER, // [225] 0xe1 INVALID_LETTER, // [226] 0xe2 INVALID_LETTER, // [227] 0xe3 INVALID_LETTER, // [228] 0xe4 INVALID_LETTER, // [229] 0xe5 INVALID_LETTER, // [230] 0xe6 INVALID_LETTER, // [231] 0xe7 INVALID_LETTER, // [232] 0xe8 INVALID_LETTER, // [233] 0xe9 INVALID_LETTER, // [234] 0xea INVALID_LETTER, // [235] 0xeb INVALID_LETTER, // [236] 0xec INVALID_LETTER, // [237] 0xed INVALID_LETTER, // [238] 0xee INVALID_LETTER, // [239] 0xef INVALID_LETTER, // [240] 0xf0 INVALID_LETTER, // [241] 0xf1 INVALID_LETTER, // [242] 0xf2 INVALID_LETTER, // [243] 0xf3 INVALID_LETTER, // [244] 0xf4 INVALID_LETTER, // [245] 0xf5 INVALID_LETTER, // [246] 0xf6 INVALID_LETTER, // [247] 0xf7 INVALID_LETTER, // [248] 0xf8 INVALID_LETTER, // [249] 0xf9 INVALID_LETTER, // [250] 0xfa INVALID_LETTER, // [251] 0xfb INVALID_LETTER, // [252] 0xfc INVALID_LETTER, // [253] 0xfd INVALID_LETTER, // [254] 0xfe INVALID_LETTER, // [255] 0xff }; byte g_CharToLetterAmino[256] = { INVALID_LETTER, // [ 0] 0x00 INVALID_LETTER, // [ 1] 0x01 INVALID_LETTER, // [ 2] 0x02 INVALID_LETTER, // [ 3] 0x03 INVALID_LETTER, // [ 4] 0x04 INVALID_LETTER, // [ 5] 0x05 INVALID_LETTER, // [ 6] 0x06 INVALID_LETTER, // [ 7] 0x07 INVALID_LETTER, // [ 8] 0x08 INVALID_LETTER, // [ 9] 0x09 INVALID_LETTER, // [ 10] 0x0a INVALID_LETTER, // [ 11] 0x0b INVALID_LETTER, // [ 12] 0x0c INVALID_LETTER, // [ 13] 0x0d INVALID_LETTER, // [ 14] 0x0e INVALID_LETTER, // [ 15] 0x0f 
INVALID_LETTER, // [ 16] 0x10 INVALID_LETTER, // [ 17] 0x11 INVALID_LETTER, // [ 18] 0x12 INVALID_LETTER, // [ 19] 0x13 INVALID_LETTER, // [ 20] 0x14 INVALID_LETTER, // [ 21] 0x15 INVALID_LETTER, // [ 22] 0x16 INVALID_LETTER, // [ 23] 0x17 INVALID_LETTER, // [ 24] 0x18 INVALID_LETTER, // [ 25] 0x19 INVALID_LETTER, // [ 26] 0x1a INVALID_LETTER, // [ 27] 0x1b INVALID_LETTER, // [ 28] 0x1c INVALID_LETTER, // [ 29] 0x1d INVALID_LETTER, // [ 30] 0x1e INVALID_LETTER, // [ 31] 0x1f INVALID_LETTER, // [ 32] ' ' INVALID_LETTER, // [ 33] '!' INVALID_LETTER, // [ 34] '"' INVALID_LETTER, // [ 35] '#' INVALID_LETTER, // [ 36] '$' INVALID_LETTER, // [ 37] '%' INVALID_LETTER, // [ 38] '&' INVALID_LETTER, // [ 39] ''' INVALID_LETTER, // [ 40] '(' INVALID_LETTER, // [ 41] ')' INVALID_LETTER, // [ 42] '*' INVALID_LETTER, // [ 43] '+' INVALID_LETTER, // [ 44] ',' INVALID_LETTER, // [ 45] '-' INVALID_LETTER, // [ 46] '.' INVALID_LETTER, // [ 47] '/' INVALID_LETTER, // [ 48] '0' INVALID_LETTER, // [ 49] '1' INVALID_LETTER, // [ 50] '2' INVALID_LETTER, // [ 51] '3' INVALID_LETTER, // [ 52] '4' INVALID_LETTER, // [ 53] '5' INVALID_LETTER, // [ 54] '6' INVALID_LETTER, // [ 55] '7' INVALID_LETTER, // [ 56] '8' INVALID_LETTER, // [ 57] '9' INVALID_LETTER, // [ 58] ':' INVALID_LETTER, // [ 59] ';' INVALID_LETTER, // [ 60] '<' INVALID_LETTER, // [ 61] '=' INVALID_LETTER, // [ 62] '>' INVALID_LETTER, // [ 63] '?' 
INVALID_LETTER, // [ 64] '@' 0 , // [ 65] 'A' = Ala INVALID_LETTER, // [ 66] 'B' 1 , // [ 67] 'C' = Cys 2 , // [ 68] 'D' = Asp 3 , // [ 69] 'E' = Glu 4 , // [ 70] 'F' = Phe 5 , // [ 71] 'G' = Gly 6 , // [ 72] 'H' = His 7 , // [ 73] 'I' = Ile INVALID_LETTER, // [ 74] 'J' 8 , // [ 75] 'K' = Lys 9 , // [ 76] 'L' = Leu 10 , // [ 77] 'M' = Met 11 , // [ 78] 'N' = Asn INVALID_LETTER, // [ 79] 'O' 12 , // [ 80] 'P' = Pro 13 , // [ 81] 'Q' = Gln 14 , // [ 82] 'R' = Arg 15 , // [ 83] 'S' = Ser 16 , // [ 84] 'T' = Thr INVALID_LETTER, // [ 85] 'U' 17 , // [ 86] 'V' = Val 18 , // [ 87] 'W' = Trp INVALID_LETTER, // [ 88] 'X' 19 , // [ 89] 'Y' = Tyr INVALID_LETTER, // [ 90] 'Z' INVALID_LETTER, // [ 91] '[' INVALID_LETTER, // [ 92] '\' INVALID_LETTER, // [ 93] ']' INVALID_LETTER, // [ 94] '^' INVALID_LETTER, // [ 95] '_' INVALID_LETTER, // [ 96] '`' 0 , // [ 97] 'a' = Ala INVALID_LETTER, // [ 98] 'b' 1 , // [ 99] 'c' = Cys 2 , // [100] 'd' = Asp 3 , // [101] 'e' = Glu 4 , // [102] 'f' = Phe 5 , // [103] 'g' = Gly 6 , // [104] 'h' = His 7 , // [105] 'i' = Ile INVALID_LETTER, // [106] 'j' 8 , // [107] 'k' = Lys 9 , // [108] 'l' = Leu 10 , // [109] 'm' = Met 11 , // [110] 'n' = Asn INVALID_LETTER, // [111] 'o' 12 , // [112] 'p' = Pro 13 , // [113] 'q' = Gln 14 , // [114] 'r' = Arg 15 , // [115] 's' = Ser 16 , // [116] 't' = Thr INVALID_LETTER, // [117] 'u' 17 , // [118] 'v' = Val 18 , // [119] 'w' = Trp INVALID_LETTER, // [120] 'x' 19 , // [121] 'y' = Tyr INVALID_LETTER, // [122] 'z' INVALID_LETTER, // [123] '{' INVALID_LETTER, // [124] '|' INVALID_LETTER, // [125] '}' INVALID_LETTER, // [126] '~' INVALID_LETTER, // [127] 0x7f INVALID_LETTER, // [128] 0x80 INVALID_LETTER, // [129] 0x81 INVALID_LETTER, // [130] 0x82 INVALID_LETTER, // [131] 0x83 INVALID_LETTER, // [132] 0x84 INVALID_LETTER, // [133] 0x85 INVALID_LETTER, // [134] 0x86 INVALID_LETTER, // [135] 0x87 INVALID_LETTER, // [136] 0x88 INVALID_LETTER, // [137] 0x89 INVALID_LETTER, // [138] 0x8a INVALID_LETTER, // [139] 0x8b 
INVALID_LETTER, // [140] 0x8c INVALID_LETTER, // [141] 0x8d INVALID_LETTER, // [142] 0x8e INVALID_LETTER, // [143] 0x8f INVALID_LETTER, // [144] 0x90 INVALID_LETTER, // [145] 0x91 INVALID_LETTER, // [146] 0x92 INVALID_LETTER, // [147] 0x93 INVALID_LETTER, // [148] 0x94 INVALID_LETTER, // [149] 0x95 INVALID_LETTER, // [150] 0x96 INVALID_LETTER, // [151] 0x97 INVALID_LETTER, // [152] 0x98 INVALID_LETTER, // [153] 0x99 INVALID_LETTER, // [154] 0x9a INVALID_LETTER, // [155] 0x9b INVALID_LETTER, // [156] 0x9c INVALID_LETTER, // [157] 0x9d INVALID_LETTER, // [158] 0x9e INVALID_LETTER, // [159] 0x9f INVALID_LETTER, // [160] 0xa0 INVALID_LETTER, // [161] 0xa1 INVALID_LETTER, // [162] 0xa2 INVALID_LETTER, // [163] 0xa3 INVALID_LETTER, // [164] 0xa4 INVALID_LETTER, // [165] 0xa5 INVALID_LETTER, // [166] 0xa6 INVALID_LETTER, // [167] 0xa7 INVALID_LETTER, // [168] 0xa8 INVALID_LETTER, // [169] 0xa9 INVALID_LETTER, // [170] 0xaa INVALID_LETTER, // [171] 0xab INVALID_LETTER, // [172] 0xac INVALID_LETTER, // [173] 0xad INVALID_LETTER, // [174] 0xae INVALID_LETTER, // [175] 0xaf INVALID_LETTER, // [176] 0xb0 INVALID_LETTER, // [177] 0xb1 INVALID_LETTER, // [178] 0xb2 INVALID_LETTER, // [179] 0xb3 INVALID_LETTER, // [180] 0xb4 INVALID_LETTER, // [181] 0xb5 INVALID_LETTER, // [182] 0xb6 INVALID_LETTER, // [183] 0xb7 INVALID_LETTER, // [184] 0xb8 INVALID_LETTER, // [185] 0xb9 INVALID_LETTER, // [186] 0xba INVALID_LETTER, // [187] 0xbb INVALID_LETTER, // [188] 0xbc INVALID_LETTER, // [189] 0xbd INVALID_LETTER, // [190] 0xbe INVALID_LETTER, // [191] 0xbf INVALID_LETTER, // [192] 0xc0 INVALID_LETTER, // [193] 0xc1 INVALID_LETTER, // [194] 0xc2 INVALID_LETTER, // [195] 0xc3 INVALID_LETTER, // [196] 0xc4 INVALID_LETTER, // [197] 0xc5 INVALID_LETTER, // [198] 0xc6 INVALID_LETTER, // [199] 0xc7 INVALID_LETTER, // [200] 0xc8 INVALID_LETTER, // [201] 0xc9 INVALID_LETTER, // [202] 0xca INVALID_LETTER, // [203] 0xcb INVALID_LETTER, // [204] 0xcc INVALID_LETTER, // [205] 0xcd INVALID_LETTER, // 
[206] 0xce INVALID_LETTER, // [207] 0xcf INVALID_LETTER, // [208] 0xd0 INVALID_LETTER, // [209] 0xd1 INVALID_LETTER, // [210] 0xd2 INVALID_LETTER, // [211] 0xd3 INVALID_LETTER, // [212] 0xd4 INVALID_LETTER, // [213] 0xd5 INVALID_LETTER, // [214] 0xd6 INVALID_LETTER, // [215] 0xd7 INVALID_LETTER, // [216] 0xd8 INVALID_LETTER, // [217] 0xd9 INVALID_LETTER, // [218] 0xda INVALID_LETTER, // [219] 0xdb INVALID_LETTER, // [220] 0xdc INVALID_LETTER, // [221] 0xdd INVALID_LETTER, // [222] 0xde INVALID_LETTER, // [223] 0xdf INVALID_LETTER, // [224] 0xe0 INVALID_LETTER, // [225] 0xe1 INVALID_LETTER, // [226] 0xe2 INVALID_LETTER, // [227] 0xe3 INVALID_LETTER, // [228] 0xe4 INVALID_LETTER, // [229] 0xe5 INVALID_LETTER, // [230] 0xe6 INVALID_LETTER, // [231] 0xe7 INVALID_LETTER, // [232] 0xe8 INVALID_LETTER, // [233] 0xe9 INVALID_LETTER, // [234] 0xea INVALID_LETTER, // [235] 0xeb INVALID_LETTER, // [236] 0xec INVALID_LETTER, // [237] 0xed INVALID_LETTER, // [238] 0xee INVALID_LETTER, // [239] 0xef INVALID_LETTER, // [240] 0xf0 INVALID_LETTER, // [241] 0xf1 INVALID_LETTER, // [242] 0xf2 INVALID_LETTER, // [243] 0xf3 INVALID_LETTER, // [244] 0xf4 INVALID_LETTER, // [245] 0xf5 INVALID_LETTER, // [246] 0xf6 INVALID_LETTER, // [247] 0xf7 INVALID_LETTER, // [248] 0xf8 INVALID_LETTER, // [249] 0xf9 INVALID_LETTER, // [250] 0xfa INVALID_LETTER, // [251] 0xfb INVALID_LETTER, // [252] 0xfc INVALID_LETTER, // [253] 0xfd INVALID_LETTER, // [254] 0xfe INVALID_LETTER, // [255] 0xff }; byte g_LetterToCharAmino[256] = { 'A', // [0] 'C', // [1] 'D', // [2] 'E', // [3] 'F', // [4] 'G', // [5] 'H', // [6] 'I', // [7] 'K', // [8] 'L', // [9] 'M', // [10] 'N', // [11] 'P', // [12] 'Q', // [13] 'R', // [14] 'S', // [15] 'T', // [16] 'V', // [17] 'W', // [18] 'Y', // [19] '*', // [20] INVALID_CHAR, // [21] INVALID_CHAR, // [22] INVALID_CHAR, // [23] INVALID_CHAR, // [24] INVALID_CHAR, // [25] INVALID_CHAR, // [26] INVALID_CHAR, // [27] INVALID_CHAR, // [28] INVALID_CHAR, // [29] INVALID_CHAR, // 
[30] INVALID_CHAR, // [31] INVALID_CHAR, // [32] INVALID_CHAR, // [33] INVALID_CHAR, // [34] INVALID_CHAR, // [35] INVALID_CHAR, // [36] INVALID_CHAR, // [37] INVALID_CHAR, // [38] INVALID_CHAR, // [39] INVALID_CHAR, // [40] INVALID_CHAR, // [41] INVALID_CHAR, // [42] INVALID_CHAR, // [43] INVALID_CHAR, // [44] INVALID_CHAR, // [45] INVALID_CHAR, // [46] INVALID_CHAR, // [47] INVALID_CHAR, // [48] INVALID_CHAR, // [49] INVALID_CHAR, // [50] INVALID_CHAR, // [51] INVALID_CHAR, // [52] INVALID_CHAR, // [53] INVALID_CHAR, // [54] INVALID_CHAR, // [55] INVALID_CHAR, // [56] INVALID_CHAR, // [57] INVALID_CHAR, // [58] INVALID_CHAR, // [59] INVALID_CHAR, // [60] INVALID_CHAR, // [61] INVALID_CHAR, // [62] INVALID_CHAR, // [63] INVALID_CHAR, // [64] INVALID_CHAR, // [65] INVALID_CHAR, // [66] INVALID_CHAR, // [67] INVALID_CHAR, // [68] INVALID_CHAR, // [69] INVALID_CHAR, // [70] INVALID_CHAR, // [71] INVALID_CHAR, // [72] INVALID_CHAR, // [73] INVALID_CHAR, // [74] INVALID_CHAR, // [75] INVALID_CHAR, // [76] INVALID_CHAR, // [77] INVALID_CHAR, // [78] INVALID_CHAR, // [79] INVALID_CHAR, // [80] INVALID_CHAR, // [81] INVALID_CHAR, // [82] INVALID_CHAR, // [83] INVALID_CHAR, // [84] INVALID_CHAR, // [85] INVALID_CHAR, // [86] INVALID_CHAR, // [87] INVALID_CHAR, // [88] INVALID_CHAR, // [89] INVALID_CHAR, // [90] INVALID_CHAR, // [91] INVALID_CHAR, // [92] INVALID_CHAR, // [93] INVALID_CHAR, // [94] INVALID_CHAR, // [95] INVALID_CHAR, // [96] INVALID_CHAR, // [97] INVALID_CHAR, // [98] INVALID_CHAR, // [99] INVALID_CHAR, // [100] INVALID_CHAR, // [101] INVALID_CHAR, // [102] INVALID_CHAR, // [103] INVALID_CHAR, // [104] INVALID_CHAR, // [105] INVALID_CHAR, // [106] INVALID_CHAR, // [107] INVALID_CHAR, // [108] INVALID_CHAR, // [109] INVALID_CHAR, // [110] INVALID_CHAR, // [111] INVALID_CHAR, // [112] INVALID_CHAR, // [113] INVALID_CHAR, // [114] INVALID_CHAR, // [115] INVALID_CHAR, // [116] INVALID_CHAR, // [117] INVALID_CHAR, // [118] INVALID_CHAR, // [119] INVALID_CHAR, // 
[120] INVALID_CHAR, // [121] INVALID_CHAR, // [122] INVALID_CHAR, // [123] INVALID_CHAR, // [124] INVALID_CHAR, // [125] INVALID_CHAR, // [126] INVALID_CHAR, // [127] INVALID_CHAR, // [128] INVALID_CHAR, // [129] INVALID_CHAR, // [130] INVALID_CHAR, // [131] INVALID_CHAR, // [132] INVALID_CHAR, // [133] INVALID_CHAR, // [134] INVALID_CHAR, // [135] INVALID_CHAR, // [136] INVALID_CHAR, // [137] INVALID_CHAR, // [138] INVALID_CHAR, // [139] INVALID_CHAR, // [140] INVALID_CHAR, // [141] INVALID_CHAR, // [142] INVALID_CHAR, // [143] INVALID_CHAR, // [144] INVALID_CHAR, // [145] INVALID_CHAR, // [146] INVALID_CHAR, // [147] INVALID_CHAR, // [148] INVALID_CHAR, // [149] INVALID_CHAR, // [150] INVALID_CHAR, // [151] INVALID_CHAR, // [152] INVALID_CHAR, // [153] INVALID_CHAR, // [154] INVALID_CHAR, // [155] INVALID_CHAR, // [156] INVALID_CHAR, // [157] INVALID_CHAR, // [158] INVALID_CHAR, // [159] INVALID_CHAR, // [160] INVALID_CHAR, // [161] INVALID_CHAR, // [162] INVALID_CHAR, // [163] INVALID_CHAR, // [164] INVALID_CHAR, // [165] INVALID_CHAR, // [166] INVALID_CHAR, // [167] INVALID_CHAR, // [168] INVALID_CHAR, // [169] INVALID_CHAR, // [170] INVALID_CHAR, // [171] INVALID_CHAR, // [172] INVALID_CHAR, // [173] INVALID_CHAR, // [174] INVALID_CHAR, // [175] INVALID_CHAR, // [176] INVALID_CHAR, // [177] INVALID_CHAR, // [178] INVALID_CHAR, // [179] INVALID_CHAR, // [180] INVALID_CHAR, // [181] INVALID_CHAR, // [182] INVALID_CHAR, // [183] INVALID_CHAR, // [184] INVALID_CHAR, // [185] INVALID_CHAR, // [186] INVALID_CHAR, // [187] INVALID_CHAR, // [188] INVALID_CHAR, // [189] INVALID_CHAR, // [190] INVALID_CHAR, // [191] INVALID_CHAR, // [192] INVALID_CHAR, // [193] INVALID_CHAR, // [194] INVALID_CHAR, // [195] INVALID_CHAR, // [196] INVALID_CHAR, // [197] INVALID_CHAR, // [198] INVALID_CHAR, // [199] INVALID_CHAR, // [200] INVALID_CHAR, // [201] INVALID_CHAR, // [202] INVALID_CHAR, // [203] INVALID_CHAR, // [204] INVALID_CHAR, // [205] INVALID_CHAR, // [206] INVALID_CHAR, 
// [207] INVALID_CHAR, // [208] INVALID_CHAR, // [209] INVALID_CHAR, // [210] INVALID_CHAR, // [211] INVALID_CHAR, // [212] INVALID_CHAR, // [213] INVALID_CHAR, // [214] INVALID_CHAR, // [215] INVALID_CHAR, // [216] INVALID_CHAR, // [217] INVALID_CHAR, // [218] INVALID_CHAR, // [219] INVALID_CHAR, // [220] INVALID_CHAR, // [221] INVALID_CHAR, // [222] INVALID_CHAR, // [223] INVALID_CHAR, // [224] INVALID_CHAR, // [225] INVALID_CHAR, // [226] INVALID_CHAR, // [227] INVALID_CHAR, // [228] INVALID_CHAR, // [229] INVALID_CHAR, // [230] INVALID_CHAR, // [231] INVALID_CHAR, // [232] INVALID_CHAR, // [233] INVALID_CHAR, // [234] INVALID_CHAR, // [235] INVALID_CHAR, // [236] INVALID_CHAR, // [237] INVALID_CHAR, // [238] INVALID_CHAR, // [239] INVALID_CHAR, // [240] INVALID_CHAR, // [241] INVALID_CHAR, // [242] INVALID_CHAR, // [243] INVALID_CHAR, // [244] INVALID_CHAR, // [245] INVALID_CHAR, // [246] INVALID_CHAR, // [247] INVALID_CHAR, // [248] INVALID_CHAR, // [249] INVALID_CHAR, // [250] INVALID_CHAR, // [251] INVALID_CHAR, // [252] INVALID_CHAR, // [253] INVALID_CHAR, // [254] INVALID_CHAR, // [255] }; byte g_LetterToCharAminoGap[256] = { 'A', // [0] 'C', // [1] 'D', // [2] 'E', // [3] 'F', // [4] 'G', // [5] 'H', // [6] 'I', // [7] 'K', // [8] 'L', // [9] 'M', // [10] 'N', // [11] 'P', // [12] 'Q', // [13] 'R', // [14] 'S', // [15] 'T', // [16] 'V', // [17] 'W', // [18] 'Y', // [19] INVALID_CHAR, // [20] '-', // [20] INVALID_CHAR, // [22] INVALID_CHAR, // [23] INVALID_CHAR, // [24] INVALID_CHAR, // [25] INVALID_CHAR, // [26] INVALID_CHAR, // [27] INVALID_CHAR, // [28] INVALID_CHAR, // [29] INVALID_CHAR, // [30] INVALID_CHAR, // [31] INVALID_CHAR, // [32] INVALID_CHAR, // [33] INVALID_CHAR, // [34] INVALID_CHAR, // [35] INVALID_CHAR, // [36] INVALID_CHAR, // [37] INVALID_CHAR, // [38] INVALID_CHAR, // [39] INVALID_CHAR, // [40] INVALID_CHAR, // [41] INVALID_CHAR, // [42] INVALID_CHAR, // [43] INVALID_CHAR, // [44] INVALID_CHAR, // [45] INVALID_CHAR, // [46] 
INVALID_CHAR, // [47] INVALID_CHAR, // [48] INVALID_CHAR, // [49] INVALID_CHAR, // [50] INVALID_CHAR, // [51] INVALID_CHAR, // [52] INVALID_CHAR, // [53] INVALID_CHAR, // [54] INVALID_CHAR, // [55] INVALID_CHAR, // [56] INVALID_CHAR, // [57] INVALID_CHAR, // [58] INVALID_CHAR, // [59] INVALID_CHAR, // [60] INVALID_CHAR, // [61] INVALID_CHAR, // [62] INVALID_CHAR, // [63] INVALID_CHAR, // [64] INVALID_CHAR, // [65] INVALID_CHAR, // [66] INVALID_CHAR, // [67] INVALID_CHAR, // [68] INVALID_CHAR, // [69] INVALID_CHAR, // [70] INVALID_CHAR, // [71] INVALID_CHAR, // [72] INVALID_CHAR, // [73] INVALID_CHAR, // [74] INVALID_CHAR, // [75] INVALID_CHAR, // [76] INVALID_CHAR, // [77] INVALID_CHAR, // [78] INVALID_CHAR, // [79] INVALID_CHAR, // [80] INVALID_CHAR, // [81] INVALID_CHAR, // [82] INVALID_CHAR, // [83] INVALID_CHAR, // [84] INVALID_CHAR, // [85] INVALID_CHAR, // [86] INVALID_CHAR, // [87] INVALID_CHAR, // [88] INVALID_CHAR, // [89] INVALID_CHAR, // [90] INVALID_CHAR, // [91] INVALID_CHAR, // [92] INVALID_CHAR, // [93] INVALID_CHAR, // [94] INVALID_CHAR, // [95] INVALID_CHAR, // [96] INVALID_CHAR, // [97] INVALID_CHAR, // [98] INVALID_CHAR, // [99] INVALID_CHAR, // [100] INVALID_CHAR, // [101] INVALID_CHAR, // [102] INVALID_CHAR, // [103] INVALID_CHAR, // [104] INVALID_CHAR, // [105] INVALID_CHAR, // [106] INVALID_CHAR, // [107] INVALID_CHAR, // [108] INVALID_CHAR, // [109] INVALID_CHAR, // [110] INVALID_CHAR, // [111] INVALID_CHAR, // [112] INVALID_CHAR, // [113] INVALID_CHAR, // [114] INVALID_CHAR, // [115] INVALID_CHAR, // [116] INVALID_CHAR, // [117] INVALID_CHAR, // [118] INVALID_CHAR, // [119] INVALID_CHAR, // [120] INVALID_CHAR, // [121] INVALID_CHAR, // [122] INVALID_CHAR, // [123] INVALID_CHAR, // [124] INVALID_CHAR, // [125] INVALID_CHAR, // [126] INVALID_CHAR, // [127] INVALID_CHAR, // [128] INVALID_CHAR, // [129] INVALID_CHAR, // [130] INVALID_CHAR, // [131] INVALID_CHAR, // [132] INVALID_CHAR, // [133] INVALID_CHAR, // [134] INVALID_CHAR, // [135] 
INVALID_CHAR, // [136] INVALID_CHAR, // [137] INVALID_CHAR, // [138] INVALID_CHAR, // [139] INVALID_CHAR, // [140] INVALID_CHAR, // [141] INVALID_CHAR, // [142] INVALID_CHAR, // [143] INVALID_CHAR, // [144] INVALID_CHAR, // [145] INVALID_CHAR, // [146] INVALID_CHAR, // [147] INVALID_CHAR, // [148] INVALID_CHAR, // [149] INVALID_CHAR, // [150] INVALID_CHAR, // [151] INVALID_CHAR, // [152] INVALID_CHAR, // [153] INVALID_CHAR, // [154] INVALID_CHAR, // [155] INVALID_CHAR, // [156] INVALID_CHAR, // [157] INVALID_CHAR, // [158] INVALID_CHAR, // [159] INVALID_CHAR, // [160] INVALID_CHAR, // [161] INVALID_CHAR, // [162] INVALID_CHAR, // [163] INVALID_CHAR, // [164] INVALID_CHAR, // [165] INVALID_CHAR, // [166] INVALID_CHAR, // [167] INVALID_CHAR, // [168] INVALID_CHAR, // [169] INVALID_CHAR, // [170] INVALID_CHAR, // [171] INVALID_CHAR, // [172] INVALID_CHAR, // [173] INVALID_CHAR, // [174] INVALID_CHAR, // [175] INVALID_CHAR, // [176] INVALID_CHAR, // [177] INVALID_CHAR, // [178] INVALID_CHAR, // [179] INVALID_CHAR, // [180] INVALID_CHAR, // [181] INVALID_CHAR, // [182] INVALID_CHAR, // [183] INVALID_CHAR, // [184] INVALID_CHAR, // [185] INVALID_CHAR, // [186] INVALID_CHAR, // [187] INVALID_CHAR, // [188] INVALID_CHAR, // [189] INVALID_CHAR, // [190] INVALID_CHAR, // [191] INVALID_CHAR, // [192] INVALID_CHAR, // [193] INVALID_CHAR, // [194] INVALID_CHAR, // [195] INVALID_CHAR, // [196] INVALID_CHAR, // [197] INVALID_CHAR, // [198] INVALID_CHAR, // [199] INVALID_CHAR, // [200] INVALID_CHAR, // [201] INVALID_CHAR, // [202] INVALID_CHAR, // [203] INVALID_CHAR, // [204] INVALID_CHAR, // [205] INVALID_CHAR, // [206] INVALID_CHAR, // [207] INVALID_CHAR, // [208] INVALID_CHAR, // [209] INVALID_CHAR, // [210] INVALID_CHAR, // [211] INVALID_CHAR, // [212] INVALID_CHAR, // [213] INVALID_CHAR, // [214] INVALID_CHAR, // [215] INVALID_CHAR, // [216] INVALID_CHAR, // [217] INVALID_CHAR, // [218] INVALID_CHAR, // [219] INVALID_CHAR, // [220] INVALID_CHAR, // [221] INVALID_CHAR, // 
[222] INVALID_CHAR, // [223] INVALID_CHAR, // [224] INVALID_CHAR, // [225] INVALID_CHAR, // [226] INVALID_CHAR, // [227] INVALID_CHAR, // [228] INVALID_CHAR, // [229] INVALID_CHAR, // [230] INVALID_CHAR, // [231] INVALID_CHAR, // [232] INVALID_CHAR, // [233] INVALID_CHAR, // [234] INVALID_CHAR, // [235] INVALID_CHAR, // [236] INVALID_CHAR, // [237] INVALID_CHAR, // [238] INVALID_CHAR, // [239] INVALID_CHAR, // [240] INVALID_CHAR, // [241] INVALID_CHAR, // [242] INVALID_CHAR, // [243] INVALID_CHAR, // [244] INVALID_CHAR, // [245] INVALID_CHAR, // [246] INVALID_CHAR, // [247] INVALID_CHAR, // [248] INVALID_CHAR, // [249] INVALID_CHAR, // [250] INVALID_CHAR, // [251] INVALID_CHAR, // [252] INVALID_CHAR, // [253] INVALID_CHAR, // [254] INVALID_CHAR, // [255] }; byte g_CharToLetterNucleo[256] = { INVALID_LETTER, // [ 0] = 0x00 INVALID_LETTER, // [ 1] = 0x01 INVALID_LETTER, // [ 2] = 0x02 INVALID_LETTER, // [ 3] = 0x03 INVALID_LETTER, // [ 4] = 0x04 INVALID_LETTER, // [ 5] = 0x05 INVALID_LETTER, // [ 6] = 0x06 INVALID_LETTER, // [ 7] = 0x07 INVALID_LETTER, // [ 8] = 0x08 INVALID_LETTER, // [ 9] = 0x09 INVALID_LETTER, // [ 10] = 0x0a INVALID_LETTER, // [ 11] = 0x0b INVALID_LETTER, // [ 12] = 0x0c INVALID_LETTER, // [ 13] = 0x0d INVALID_LETTER, // [ 14] = 0x0e INVALID_LETTER, // [ 15] = 0x0f INVALID_LETTER, // [ 16] = 0x10 INVALID_LETTER, // [ 17] = 0x11 INVALID_LETTER, // [ 18] = 0x12 INVALID_LETTER, // [ 19] = 0x13 INVALID_LETTER, // [ 20] = 0x14 INVALID_LETTER, // [ 21] = 0x15 INVALID_LETTER, // [ 22] = 0x16 INVALID_LETTER, // [ 23] = 0x17 INVALID_LETTER, // [ 24] = 0x18 INVALID_LETTER, // [ 25] = 0x19 INVALID_LETTER, // [ 26] = 0x1a INVALID_LETTER, // [ 27] = 0x1b INVALID_LETTER, // [ 28] = 0x1c INVALID_LETTER, // [ 29] = 0x1d INVALID_LETTER, // [ 30] = 0x1e INVALID_LETTER, // [ 31] = 0x1f INVALID_LETTER, // [ 32] = 32 INVALID_LETTER, // [ 33] = 33 INVALID_LETTER, // [ 34] = 34 INVALID_LETTER, // [ 35] = 35 INVALID_LETTER, // [ 36] = 36 INVALID_LETTER, // [ 37] = 37 
INVALID_LETTER, // [ 38] = 38 INVALID_LETTER, // [ 39] = 39 INVALID_LETTER, // [ 40] = 40 INVALID_LETTER, // [ 41] = 41 INVALID_LETTER, // [ 42] = 42 INVALID_LETTER, // [ 43] = 43 INVALID_LETTER, // [ 44] = 44 INVALID_LETTER, // [ 45] = 45 INVALID_LETTER, // [ 46] = 46 INVALID_LETTER, // [ 47] = 47 INVALID_LETTER, // [ 48] = 48 INVALID_LETTER, // [ 49] = 49 INVALID_LETTER, // [ 50] = 50 INVALID_LETTER, // [ 51] = 51 INVALID_LETTER, // [ 52] = 52 INVALID_LETTER, // [ 53] = 53 INVALID_LETTER, // [ 54] = 54 INVALID_LETTER, // [ 55] = 55 INVALID_LETTER, // [ 56] = 56 INVALID_LETTER, // [ 57] = 57 INVALID_LETTER, // [ 58] = 58 INVALID_LETTER, // [ 59] = 59 INVALID_LETTER, // [ 60] = 60 INVALID_LETTER, // [ 61] = 61 INVALID_LETTER, // [ 62] = 62 INVALID_LETTER, // [ 63] = 63 INVALID_LETTER, // [ 64] = 64 0 , // [ 65] = A (Nucleotide) INVALID_LETTER, // [ 66] = 66 1 , // [ 67] = C (Nucleotide) INVALID_LETTER, // [ 68] = 68 INVALID_LETTER, // [ 69] = 69 INVALID_LETTER, // [ 70] = 70 2 , // [ 71] = G (Nucleotide) INVALID_LETTER, // [ 72] = 72 INVALID_LETTER, // [ 73] = 73 INVALID_LETTER, // [ 74] = 74 INVALID_LETTER, // [ 75] = 75 INVALID_LETTER, // [ 76] = 76 INVALID_LETTER, // [ 77] = 77 INVALID_LETTER, // [ 78] = 78 INVALID_LETTER, // [ 79] = 79 INVALID_LETTER, // [ 80] = 80 INVALID_LETTER, // [ 81] = 81 INVALID_LETTER, // [ 82] = 82 INVALID_LETTER, // [ 83] = 83 3 , // [ 84] = T (Nucleotide) 3 , // [ 85] = U (Nucleotide) INVALID_LETTER, // [ 86] = 86 INVALID_LETTER, // [ 87] = 87 INVALID_LETTER, // [ 88] = 88 INVALID_LETTER, // [ 89] = 89 INVALID_LETTER, // [ 90] = 90 INVALID_LETTER, // [ 91] = 91 INVALID_LETTER, // [ 92] = 92 INVALID_LETTER, // [ 93] = 93 INVALID_LETTER, // [ 94] = 94 INVALID_LETTER, // [ 95] = 95 INVALID_LETTER, // [ 96] = 96 0 , // [ 97] = a (Nucleotide) INVALID_LETTER, // [ 98] = 98 1 , // [ 99] = c (Nucleotide) INVALID_LETTER, // [100] = 100 INVALID_LETTER, // [101] = 101 INVALID_LETTER, // [102] = 102 2 , // [103] = g (Nucleotide) INVALID_LETTER, 
// [104] = 104 INVALID_LETTER, // [105] = 105 INVALID_LETTER, // [106] = 106 INVALID_LETTER, // [107] = 107 INVALID_LETTER, // [108] = 108 INVALID_LETTER, // [109] = 109 INVALID_LETTER, // [110] = 110 INVALID_LETTER, // [111] = 111 INVALID_LETTER, // [112] = 112 INVALID_LETTER, // [113] = 113 INVALID_LETTER, // [114] = 114 INVALID_LETTER, // [115] = 115 3 , // [116] = t (Nucleotide) 3 , // [117] = u (Nucleotide) INVALID_LETTER, // [118] = 118 INVALID_LETTER, // [119] = 119 INVALID_LETTER, // [120] = 120 INVALID_LETTER, // [121] = 121 INVALID_LETTER, // [122] = 122 INVALID_LETTER, // [123] = 123 INVALID_LETTER, // [124] = 124 INVALID_LETTER, // [125] = 125 INVALID_LETTER, // [126] = 126 INVALID_LETTER, // [127] = 0x7f INVALID_LETTER, // [128] = 0x80 INVALID_LETTER, // [129] = 0x81 INVALID_LETTER, // [130] = 0x82 INVALID_LETTER, // [131] = 0x83 INVALID_LETTER, // [132] = 0x84 INVALID_LETTER, // [133] = 0x85 INVALID_LETTER, // [134] = 0x86 INVALID_LETTER, // [135] = 0x87 INVALID_LETTER, // [136] = 0x88 INVALID_LETTER, // [137] = 0x89 INVALID_LETTER, // [138] = 0x8a INVALID_LETTER, // [139] = 0x8b INVALID_LETTER, // [140] = 0x8c INVALID_LETTER, // [141] = 0x8d INVALID_LETTER, // [142] = 0x8e INVALID_LETTER, // [143] = 0x8f INVALID_LETTER, // [144] = 0x90 INVALID_LETTER, // [145] = 0x91 INVALID_LETTER, // [146] = 0x92 INVALID_LETTER, // [147] = 0x93 INVALID_LETTER, // [148] = 0x94 INVALID_LETTER, // [149] = 0x95 INVALID_LETTER, // [150] = 0x96 INVALID_LETTER, // [151] = 0x97 INVALID_LETTER, // [152] = 0x98 INVALID_LETTER, // [153] = 0x99 INVALID_LETTER, // [154] = 0x9a INVALID_LETTER, // [155] = 0x9b INVALID_LETTER, // [156] = 0x9c INVALID_LETTER, // [157] = 0x9d INVALID_LETTER, // [158] = 0x9e INVALID_LETTER, // [159] = 0x9f INVALID_LETTER, // [160] = 0xa0 INVALID_LETTER, // [161] = 0xa1 INVALID_LETTER, // [162] = 0xa2 INVALID_LETTER, // [163] = 0xa3 INVALID_LETTER, // [164] = 0xa4 INVALID_LETTER, // [165] = 0xa5 INVALID_LETTER, // [166] = 0xa6 INVALID_LETTER, // [167] 
= 0xa7 INVALID_LETTER, // [168] = 0xa8 INVALID_LETTER, // [169] = 0xa9 INVALID_LETTER, // [170] = 0xaa INVALID_LETTER, // [171] = 0xab INVALID_LETTER, // [172] = 0xac INVALID_LETTER, // [173] = 0xad INVALID_LETTER, // [174] = 0xae INVALID_LETTER, // [175] = 0xaf INVALID_LETTER, // [176] = 0xb0 INVALID_LETTER, // [177] = 0xb1 INVALID_LETTER, // [178] = 0xb2 INVALID_LETTER, // [179] = 0xb3 INVALID_LETTER, // [180] = 0xb4 INVALID_LETTER, // [181] = 0xb5 INVALID_LETTER, // [182] = 0xb6 INVALID_LETTER, // [183] = 0xb7 INVALID_LETTER, // [184] = 0xb8 INVALID_LETTER, // [185] = 0xb9 INVALID_LETTER, // [186] = 0xba INVALID_LETTER, // [187] = 0xbb INVALID_LETTER, // [188] = 0xbc INVALID_LETTER, // [189] = 0xbd INVALID_LETTER, // [190] = 0xbe INVALID_LETTER, // [191] = 0xbf INVALID_LETTER, // [192] = 0xc0 INVALID_LETTER, // [193] = 0xc1 INVALID_LETTER, // [194] = 0xc2 INVALID_LETTER, // [195] = 0xc3 INVALID_LETTER, // [196] = 0xc4 INVALID_LETTER, // [197] = 0xc5 INVALID_LETTER, // [198] = 0xc6 INVALID_LETTER, // [199] = 0xc7 INVALID_LETTER, // [200] = 0xc8 INVALID_LETTER, // [201] = 0xc9 INVALID_LETTER, // [202] = 0xca INVALID_LETTER, // [203] = 0xcb INVALID_LETTER, // [204] = 0xcc INVALID_LETTER, // [205] = 0xcd INVALID_LETTER, // [206] = 0xce INVALID_LETTER, // [207] = 0xcf INVALID_LETTER, // [208] = 0xd0 INVALID_LETTER, // [209] = 0xd1 INVALID_LETTER, // [210] = 0xd2 INVALID_LETTER, // [211] = 0xd3 INVALID_LETTER, // [212] = 0xd4 INVALID_LETTER, // [213] = 0xd5 INVALID_LETTER, // [214] = 0xd6 INVALID_LETTER, // [215] = 0xd7 INVALID_LETTER, // [216] = 0xd8 INVALID_LETTER, // [217] = 0xd9 INVALID_LETTER, // [218] = 0xda INVALID_LETTER, // [219] = 0xdb INVALID_LETTER, // [220] = 0xdc INVALID_LETTER, // [221] = 0xdd INVALID_LETTER, // [222] = 0xde INVALID_LETTER, // [223] = 0xdf INVALID_LETTER, // [224] = 0xe0 INVALID_LETTER, // [225] = 0xe1 INVALID_LETTER, // [226] = 0xe2 INVALID_LETTER, // [227] = 0xe3 INVALID_LETTER, // [228] = 0xe4 INVALID_LETTER, // [229] = 0xe5 
INVALID_LETTER, // [230] = 0xe6 INVALID_LETTER, // [231] = 0xe7 INVALID_LETTER, // [232] = 0xe8 INVALID_LETTER, // [233] = 0xe9 INVALID_LETTER, // [234] = 0xea INVALID_LETTER, // [235] = 0xeb INVALID_LETTER, // [236] = 0xec INVALID_LETTER, // [237] = 0xed INVALID_LETTER, // [238] = 0xee INVALID_LETTER, // [239] = 0xef INVALID_LETTER, // [240] = 0xf0 INVALID_LETTER, // [241] = 0xf1 INVALID_LETTER, // [242] = 0xf2 INVALID_LETTER, // [243] = 0xf3 INVALID_LETTER, // [244] = 0xf4 INVALID_LETTER, // [245] = 0xf5 INVALID_LETTER, // [246] = 0xf6 INVALID_LETTER, // [247] = 0xf7 INVALID_LETTER, // [248] = 0xf8 INVALID_LETTER, // [249] = 0xf9 INVALID_LETTER, // [250] = 0xfa INVALID_LETTER, // [251] = 0xfb INVALID_LETTER, // [252] = 0xfc INVALID_LETTER, // [253] = 0xfd INVALID_LETTER, // [254] = 0xfe INVALID_LETTER, // [255] = 0xff }; byte g_CharToLetterNucleoGap[256] = { INVALID_LETTER, // [ 0] = 0x00 INVALID_LETTER, // [ 1] = 0x01 INVALID_LETTER, // [ 2] = 0x02 INVALID_LETTER, // [ 3] = 0x03 INVALID_LETTER, // [ 4] = 0x04 INVALID_LETTER, // [ 5] = 0x05 INVALID_LETTER, // [ 6] = 0x06 INVALID_LETTER, // [ 7] = 0x07 INVALID_LETTER, // [ 8] = 0x08 INVALID_LETTER, // [ 9] = 0x09 INVALID_LETTER, // [ 10] = 0x0a INVALID_LETTER, // [ 11] = 0x0b INVALID_LETTER, // [ 12] = 0x0c INVALID_LETTER, // [ 13] = 0x0d INVALID_LETTER, // [ 14] = 0x0e INVALID_LETTER, // [ 15] = 0x0f INVALID_LETTER, // [ 16] = 0x10 INVALID_LETTER, // [ 17] = 0x11 INVALID_LETTER, // [ 18] = 0x12 INVALID_LETTER, // [ 19] = 0x13 INVALID_LETTER, // [ 20] = 0x14 INVALID_LETTER, // [ 21] = 0x15 INVALID_LETTER, // [ 22] = 0x16 INVALID_LETTER, // [ 23] = 0x17 INVALID_LETTER, // [ 24] = 0x18 INVALID_LETTER, // [ 25] = 0x19 INVALID_LETTER, // [ 26] = 0x1a INVALID_LETTER, // [ 27] = 0x1b INVALID_LETTER, // [ 28] = 0x1c INVALID_LETTER, // [ 29] = 0x1d INVALID_LETTER, // [ 30] = 0x1e INVALID_LETTER, // [ 31] = 0x1f INVALID_LETTER, // [ 32] = 32 INVALID_LETTER, // [ 33] = 33 INVALID_LETTER, // [ 34] = 34 INVALID_LETTER, // [ 
35] = 35 INVALID_LETTER, // [ 36] = 36 INVALID_LETTER, // [ 37] = 37 INVALID_LETTER, // [ 38] = 38 INVALID_LETTER, // [ 39] = 39 INVALID_LETTER, // [ 40] = 40 INVALID_LETTER, // [ 41] = 41 INVALID_LETTER, // [ 42] = 42 INVALID_LETTER, // [ 43] = 43 INVALID_LETTER, // [ 44] = 44 4 , // [ 45] = - (gap) INVALID_LETTER, // [ 46] = 46 INVALID_LETTER, // [ 47] = 47 INVALID_LETTER, // [ 48] = 48 INVALID_LETTER, // [ 49] = 49 INVALID_LETTER, // [ 50] = 50 INVALID_LETTER, // [ 51] = 51 INVALID_LETTER, // [ 52] = 52 INVALID_LETTER, // [ 53] = 53 INVALID_LETTER, // [ 54] = 54 INVALID_LETTER, // [ 55] = 55 INVALID_LETTER, // [ 56] = 56 INVALID_LETTER, // [ 57] = 57 INVALID_LETTER, // [ 58] = 58 INVALID_LETTER, // [ 59] = 59 INVALID_LETTER, // [ 60] = 60 INVALID_LETTER, // [ 61] = 61 INVALID_LETTER, // [ 62] = 62 INVALID_LETTER, // [ 63] = 63 INVALID_LETTER, // [ 64] = 64 0 , // [ 65] = A (Nucleotide) INVALID_LETTER, // [ 66] = 66 1 , // [ 67] = C (Nucleotide) INVALID_LETTER, // [ 68] = 68 INVALID_LETTER, // [ 69] = 69 INVALID_LETTER, // [ 70] = 70 2 , // [ 71] = G (Nucleotide) INVALID_LETTER, // [ 72] = 72 INVALID_LETTER, // [ 73] = 73 INVALID_LETTER, // [ 74] = 74 INVALID_LETTER, // [ 75] = 75 INVALID_LETTER, // [ 76] = 76 INVALID_LETTER, // [ 77] = 77 INVALID_LETTER, // [ 78] = 78 INVALID_LETTER, // [ 79] = 79 INVALID_LETTER, // [ 80] = 80 INVALID_LETTER, // [ 81] = 81 INVALID_LETTER, // [ 82] = 82 INVALID_LETTER, // [ 83] = 83 3 , // [ 84] = T (Nucleotide) 3 , // [ 85] = U (Nucleotide) INVALID_LETTER, // [ 86] = 86 INVALID_LETTER, // [ 87] = 87 INVALID_LETTER, // [ 88] = 88 INVALID_LETTER, // [ 89] = 89 INVALID_LETTER, // [ 90] = 90 INVALID_LETTER, // [ 91] = 91 INVALID_LETTER, // [ 92] = 92 INVALID_LETTER, // [ 93] = 93 INVALID_LETTER, // [ 94] = 94 INVALID_LETTER, // [ 95] = 95 INVALID_LETTER, // [ 96] = 96 0 , // [ 97] = a (Nucleotide) INVALID_LETTER, // [ 98] = 98 1 , // [ 99] = c (Nucleotide) INVALID_LETTER, // [100] = 100 INVALID_LETTER, // [101] = 101 INVALID_LETTER, 
// [102] = 102 2 , // [103] = g (Nucleotide) INVALID_LETTER, // [104] = 104 INVALID_LETTER, // [105] = 105 INVALID_LETTER, // [106] = 106 INVALID_LETTER, // [107] = 107 INVALID_LETTER, // [108] = 108 INVALID_LETTER, // [109] = 109 INVALID_LETTER, // [110] = 110 INVALID_LETTER, // [111] = 111 INVALID_LETTER, // [112] = 112 INVALID_LETTER, // [113] = 113 INVALID_LETTER, // [114] = 114 INVALID_LETTER, // [115] = 115 3 , // [116] = t (Nucleotide) 3 , // [117] = u (Nucleotide) INVALID_LETTER, // [118] = 118 INVALID_LETTER, // [119] = 119 INVALID_LETTER, // [120] = 120 INVALID_LETTER, // [121] = 121 INVALID_LETTER, // [122] = 122 INVALID_LETTER, // [123] = 123 INVALID_LETTER, // [124] = 124 INVALID_LETTER, // [125] = 125 INVALID_LETTER, // [126] = 126 INVALID_LETTER, // [127] = 0x7f INVALID_LETTER, // [128] = 0x80 INVALID_LETTER, // [129] = 0x81 INVALID_LETTER, // [130] = 0x82 INVALID_LETTER, // [131] = 0x83 INVALID_LETTER, // [132] = 0x84 INVALID_LETTER, // [133] = 0x85 INVALID_LETTER, // [134] = 0x86 INVALID_LETTER, // [135] = 0x87 INVALID_LETTER, // [136] = 0x88 INVALID_LETTER, // [137] = 0x89 INVALID_LETTER, // [138] = 0x8a INVALID_LETTER, // [139] = 0x8b INVALID_LETTER, // [140] = 0x8c INVALID_LETTER, // [141] = 0x8d INVALID_LETTER, // [142] = 0x8e INVALID_LETTER, // [143] = 0x8f INVALID_LETTER, // [144] = 0x90 INVALID_LETTER, // [145] = 0x91 INVALID_LETTER, // [146] = 0x92 INVALID_LETTER, // [147] = 0x93 INVALID_LETTER, // [148] = 0x94 INVALID_LETTER, // [149] = 0x95 INVALID_LETTER, // [150] = 0x96 INVALID_LETTER, // [151] = 0x97 INVALID_LETTER, // [152] = 0x98 INVALID_LETTER, // [153] = 0x99 INVALID_LETTER, // [154] = 0x9a INVALID_LETTER, // [155] = 0x9b INVALID_LETTER, // [156] = 0x9c INVALID_LETTER, // [157] = 0x9d INVALID_LETTER, // [158] = 0x9e INVALID_LETTER, // [159] = 0x9f INVALID_LETTER, // [160] = 0xa0 INVALID_LETTER, // [161] = 0xa1 INVALID_LETTER, // [162] = 0xa2 INVALID_LETTER, // [163] = 0xa3 INVALID_LETTER, // [164] = 0xa4 INVALID_LETTER, // [165] = 
0xa5 INVALID_LETTER, // [166] = 0xa6 INVALID_LETTER, // [167] = 0xa7 INVALID_LETTER, // [168] = 0xa8 INVALID_LETTER, // [169] = 0xa9 INVALID_LETTER, // [170] = 0xaa INVALID_LETTER, // [171] = 0xab INVALID_LETTER, // [172] = 0xac INVALID_LETTER, // [173] = 0xad INVALID_LETTER, // [174] = 0xae INVALID_LETTER, // [175] = 0xaf INVALID_LETTER, // [176] = 0xb0 INVALID_LETTER, // [177] = 0xb1 INVALID_LETTER, // [178] = 0xb2 INVALID_LETTER, // [179] = 0xb3 INVALID_LETTER, // [180] = 0xb4 INVALID_LETTER, // [181] = 0xb5 INVALID_LETTER, // [182] = 0xb6 INVALID_LETTER, // [183] = 0xb7 INVALID_LETTER, // [184] = 0xb8 INVALID_LETTER, // [185] = 0xb9 INVALID_LETTER, // [186] = 0xba INVALID_LETTER, // [187] = 0xbb INVALID_LETTER, // [188] = 0xbc INVALID_LETTER, // [189] = 0xbd INVALID_LETTER, // [190] = 0xbe INVALID_LETTER, // [191] = 0xbf INVALID_LETTER, // [192] = 0xc0 INVALID_LETTER, // [193] = 0xc1 INVALID_LETTER, // [194] = 0xc2 INVALID_LETTER, // [195] = 0xc3 INVALID_LETTER, // [196] = 0xc4 INVALID_LETTER, // [197] = 0xc5 INVALID_LETTER, // [198] = 0xc6 INVALID_LETTER, // [199] = 0xc7 INVALID_LETTER, // [200] = 0xc8 INVALID_LETTER, // [201] = 0xc9 INVALID_LETTER, // [202] = 0xca INVALID_LETTER, // [203] = 0xcb INVALID_LETTER, // [204] = 0xcc INVALID_LETTER, // [205] = 0xcd INVALID_LETTER, // [206] = 0xce INVALID_LETTER, // [207] = 0xcf INVALID_LETTER, // [208] = 0xd0 INVALID_LETTER, // [209] = 0xd1 INVALID_LETTER, // [210] = 0xd2 INVALID_LETTER, // [211] = 0xd3 INVALID_LETTER, // [212] = 0xd4 INVALID_LETTER, // [213] = 0xd5 INVALID_LETTER, // [214] = 0xd6 INVALID_LETTER, // [215] = 0xd7 INVALID_LETTER, // [216] = 0xd8 INVALID_LETTER, // [217] = 0xd9 INVALID_LETTER, // [218] = 0xda INVALID_LETTER, // [219] = 0xdb INVALID_LETTER, // [220] = 0xdc INVALID_LETTER, // [221] = 0xdd INVALID_LETTER, // [222] = 0xde INVALID_LETTER, // [223] = 0xdf INVALID_LETTER, // [224] = 0xe0 INVALID_LETTER, // [225] = 0xe1 INVALID_LETTER, // [226] = 0xe2 INVALID_LETTER, // [227] = 0xe3 
INVALID_LETTER, // [228] = 0xe4 INVALID_LETTER, // [229] = 0xe5 INVALID_LETTER, // [230] = 0xe6 INVALID_LETTER, // [231] = 0xe7 INVALID_LETTER, // [232] = 0xe8 INVALID_LETTER, // [233] = 0xe9 INVALID_LETTER, // [234] = 0xea INVALID_LETTER, // [235] = 0xeb INVALID_LETTER, // [236] = 0xec INVALID_LETTER, // [237] = 0xed INVALID_LETTER, // [238] = 0xee INVALID_LETTER, // [239] = 0xef INVALID_LETTER, // [240] = 0xf0 INVALID_LETTER, // [241] = 0xf1 INVALID_LETTER, // [242] = 0xf2 INVALID_LETTER, // [243] = 0xf3 INVALID_LETTER, // [244] = 0xf4 INVALID_LETTER, // [245] = 0xf5 INVALID_LETTER, // [246] = 0xf6 INVALID_LETTER, // [247] = 0xf7 INVALID_LETTER, // [248] = 0xf8 INVALID_LETTER, // [249] = 0xf9 INVALID_LETTER, // [250] = 0xfa INVALID_LETTER, // [251] = 0xfb INVALID_LETTER, // [252] = 0xfc INVALID_LETTER, // [253] = 0xfd INVALID_LETTER, // [254] = 0xfe INVALID_LETTER, // [255] = 0xff }; byte g_CharToLetterNucleoMasked[256] = { INVALID_LETTER, // [ 0] = 0x00 INVALID_LETTER, // [ 1] = 0x01 INVALID_LETTER, // [ 2] = 0x02 INVALID_LETTER, // [ 3] = 0x03 INVALID_LETTER, // [ 4] = 0x04 INVALID_LETTER, // [ 5] = 0x05 INVALID_LETTER, // [ 6] = 0x06 INVALID_LETTER, // [ 7] = 0x07 INVALID_LETTER, // [ 8] = 0x08 INVALID_LETTER, // [ 9] = 0x09 INVALID_LETTER, // [ 10] = 0x0a INVALID_LETTER, // [ 11] = 0x0b INVALID_LETTER, // [ 12] = 0x0c INVALID_LETTER, // [ 13] = 0x0d INVALID_LETTER, // [ 14] = 0x0e INVALID_LETTER, // [ 15] = 0x0f INVALID_LETTER, // [ 16] = 0x10 INVALID_LETTER, // [ 17] = 0x11 INVALID_LETTER, // [ 18] = 0x12 INVALID_LETTER, // [ 19] = 0x13 INVALID_LETTER, // [ 20] = 0x14 INVALID_LETTER, // [ 21] = 0x15 INVALID_LETTER, // [ 22] = 0x16 INVALID_LETTER, // [ 23] = 0x17 INVALID_LETTER, // [ 24] = 0x18 INVALID_LETTER, // [ 25] = 0x19 INVALID_LETTER, // [ 26] = 0x1a INVALID_LETTER, // [ 27] = 0x1b INVALID_LETTER, // [ 28] = 0x1c INVALID_LETTER, // [ 29] = 0x1d INVALID_LETTER, // [ 30] = 0x1e INVALID_LETTER, // [ 31] = 0x1f INVALID_LETTER, // [ 32] = 32 INVALID_LETTER, 
// [ 33] = 33 INVALID_LETTER, // [ 34] = 34 INVALID_LETTER, // [ 35] = 35 INVALID_LETTER, // [ 36] = 36 INVALID_LETTER, // [ 37] = 37 INVALID_LETTER, // [ 38] = 38 INVALID_LETTER, // [ 39] = 39 INVALID_LETTER, // [ 40] = 40 INVALID_LETTER, // [ 41] = 41 INVALID_LETTER, // [ 42] = 42 INVALID_LETTER, // [ 43] = 43 INVALID_LETTER, // [ 44] = 44 INVALID_LETTER, // [ 45] = 45 INVALID_LETTER, // [ 46] = 46 INVALID_LETTER, // [ 47] = 47 INVALID_LETTER, // [ 48] = 48 INVALID_LETTER, // [ 49] = 49 INVALID_LETTER, // [ 50] = 50 INVALID_LETTER, // [ 51] = 51 INVALID_LETTER, // [ 52] = 52 INVALID_LETTER, // [ 53] = 53 INVALID_LETTER, // [ 54] = 54 INVALID_LETTER, // [ 55] = 55 INVALID_LETTER, // [ 56] = 56 INVALID_LETTER, // [ 57] = 57 INVALID_LETTER, // [ 58] = 58 INVALID_LETTER, // [ 59] = 59 INVALID_LETTER, // [ 60] = 60 INVALID_LETTER, // [ 61] = 61 INVALID_LETTER, // [ 62] = 62 INVALID_LETTER, // [ 63] = 63 INVALID_LETTER, // [ 64] = 64 0 , // [ 65] = A (Nucleotide) INVALID_LETTER, // [ 66] = 66 1 , // [ 67] = C (Nucleotide) INVALID_LETTER, // [ 68] = 68 INVALID_LETTER, // [ 69] = 69 INVALID_LETTER, // [ 70] = 70 2 , // [ 71] = G (Nucleotide) INVALID_LETTER, // [ 72] = 72 INVALID_LETTER, // [ 73] = 73 INVALID_LETTER, // [ 74] = 74 INVALID_LETTER, // [ 75] = 75 INVALID_LETTER, // [ 76] = 76 INVALID_LETTER, // [ 77] = 77 INVALID_LETTER, // [ 78] = 78 INVALID_LETTER, // [ 79] = 79 INVALID_LETTER, // [ 80] = 80 INVALID_LETTER, // [ 81] = 81 INVALID_LETTER, // [ 82] = 82 INVALID_LETTER, // [ 83] = 83 3 , // [ 84] = T (Nucleotide) 3 , // [ 85] = U (Nucleotide) INVALID_LETTER, // [ 86] = 86 INVALID_LETTER, // [ 87] = 87 INVALID_LETTER, // [ 88] = 88 INVALID_LETTER, // [ 89] = 89 INVALID_LETTER, // [ 90] = 90 INVALID_LETTER, // [ 91] = 91 INVALID_LETTER, // [ 92] = 92 INVALID_LETTER, // [ 93] = 93 INVALID_LETTER, // [ 94] = 94 INVALID_LETTER, // [ 95] = 95 INVALID_LETTER, // [ 96] = 96 INVALID_LETTER, // [ 97] = 0x61 INVALID_LETTER, // [ 98] = 0x62 INVALID_LETTER, // [ 99] = 0x63 
INVALID_LETTER, // [100] = 0x64 INVALID_LETTER, // [101] = 0x65 INVALID_LETTER, // [102] = 0x66 INVALID_LETTER, // [103] = 0x67 INVALID_LETTER, // [104] = 0x68 INVALID_LETTER, // [105] = 0x69 INVALID_LETTER, // [106] = 0x6a INVALID_LETTER, // [107] = 0x6b INVALID_LETTER, // [108] = 0x6c INVALID_LETTER, // [109] = 0x6d INVALID_LETTER, // [110] = 0x6e INVALID_LETTER, // [111] = 0x6f INVALID_LETTER, // [112] = 0x70 INVALID_LETTER, // [113] = 0x71 INVALID_LETTER, // [114] = 0x72 INVALID_LETTER, // [115] = 0x73 INVALID_LETTER, // [116] = 0x74 INVALID_LETTER, // [117] = 0x75 INVALID_LETTER, // [118] = 0x76 INVALID_LETTER, // [119] = 0x77 INVALID_LETTER, // [120] = 0x78 INVALID_LETTER, // [121] = 0x79 INVALID_LETTER, // [122] = 0x7a INVALID_LETTER, // [123] = 123 INVALID_LETTER, // [124] = 124 INVALID_LETTER, // [125] = 125 INVALID_LETTER, // [126] = 126 INVALID_LETTER, // [127] = 0x7f INVALID_LETTER, // [128] = 0x80 INVALID_LETTER, // [129] = 0x81 INVALID_LETTER, // [130] = 0x82 INVALID_LETTER, // [131] = 0x83 INVALID_LETTER, // [132] = 0x84 INVALID_LETTER, // [133] = 0x85 INVALID_LETTER, // [134] = 0x86 INVALID_LETTER, // [135] = 0x87 INVALID_LETTER, // [136] = 0x88 INVALID_LETTER, // [137] = 0x89 INVALID_LETTER, // [138] = 0x8a INVALID_LETTER, // [139] = 0x8b INVALID_LETTER, // [140] = 0x8c INVALID_LETTER, // [141] = 0x8d INVALID_LETTER, // [142] = 0x8e INVALID_LETTER, // [143] = 0x8f INVALID_LETTER, // [144] = 0x90 INVALID_LETTER, // [145] = 0x91 INVALID_LETTER, // [146] = 0x92 INVALID_LETTER, // [147] = 0x93 INVALID_LETTER, // [148] = 0x94 INVALID_LETTER, // [149] = 0x95 INVALID_LETTER, // [150] = 0x96 INVALID_LETTER, // [151] = 0x97 INVALID_LETTER, // [152] = 0x98 INVALID_LETTER, // [153] = 0x99 INVALID_LETTER, // [154] = 0x9a INVALID_LETTER, // [155] = 0x9b INVALID_LETTER, // [156] = 0x9c INVALID_LETTER, // [157] = 0x9d INVALID_LETTER, // [158] = 0x9e INVALID_LETTER, // [159] = 0x9f INVALID_LETTER, // [160] = 0xa0 INVALID_LETTER, // [161] = 0xa1 INVALID_LETTER, // 
[162] = 0xa2 INVALID_LETTER, // [163] = 0xa3 INVALID_LETTER, // [164] = 0xa4 INVALID_LETTER, // [165] = 0xa5 INVALID_LETTER, // [166] = 0xa6 INVALID_LETTER, // [167] = 0xa7 INVALID_LETTER, // [168] = 0xa8 INVALID_LETTER, // [169] = 0xa9 INVALID_LETTER, // [170] = 0xaa INVALID_LETTER, // [171] = 0xab INVALID_LETTER, // [172] = 0xac INVALID_LETTER, // [173] = 0xad INVALID_LETTER, // [174] = 0xae INVALID_LETTER, // [175] = 0xaf INVALID_LETTER, // [176] = 0xb0 INVALID_LETTER, // [177] = 0xb1 INVALID_LETTER, // [178] = 0xb2 INVALID_LETTER, // [179] = 0xb3 INVALID_LETTER, // [180] = 0xb4 INVALID_LETTER, // [181] = 0xb5 INVALID_LETTER, // [182] = 0xb6 INVALID_LETTER, // [183] = 0xb7 INVALID_LETTER, // [184] = 0xb8 INVALID_LETTER, // [185] = 0xb9 INVALID_LETTER, // [186] = 0xba INVALID_LETTER, // [187] = 0xbb INVALID_LETTER, // [188] = 0xbc INVALID_LETTER, // [189] = 0xbd INVALID_LETTER, // [190] = 0xbe INVALID_LETTER, // [191] = 0xbf INVALID_LETTER, // [192] = 0xc0 INVALID_LETTER, // [193] = 0xc1 INVALID_LETTER, // [194] = 0xc2 INVALID_LETTER, // [195] = 0xc3 INVALID_LETTER, // [196] = 0xc4 INVALID_LETTER, // [197] = 0xc5 INVALID_LETTER, // [198] = 0xc6 INVALID_LETTER, // [199] = 0xc7 INVALID_LETTER, // [200] = 0xc8 INVALID_LETTER, // [201] = 0xc9 INVALID_LETTER, // [202] = 0xca INVALID_LETTER, // [203] = 0xcb INVALID_LETTER, // [204] = 0xcc INVALID_LETTER, // [205] = 0xcd INVALID_LETTER, // [206] = 0xce INVALID_LETTER, // [207] = 0xcf INVALID_LETTER, // [208] = 0xd0 INVALID_LETTER, // [209] = 0xd1 INVALID_LETTER, // [210] = 0xd2 INVALID_LETTER, // [211] = 0xd3 INVALID_LETTER, // [212] = 0xd4 INVALID_LETTER, // [213] = 0xd5 INVALID_LETTER, // [214] = 0xd6 INVALID_LETTER, // [215] = 0xd7 INVALID_LETTER, // [216] = 0xd8 INVALID_LETTER, // [217] = 0xd9 INVALID_LETTER, // [218] = 0xda INVALID_LETTER, // [219] = 0xdb INVALID_LETTER, // [220] = 0xdc INVALID_LETTER, // [221] = 0xdd INVALID_LETTER, // [222] = 0xde INVALID_LETTER, // [223] = 0xdf INVALID_LETTER, // [224] = 0xe0 
INVALID_LETTER, // [225] = 0xe1 INVALID_LETTER, // [226] = 0xe2 INVALID_LETTER, // [227] = 0xe3 INVALID_LETTER, // [228] = 0xe4 INVALID_LETTER, // [229] = 0xe5 INVALID_LETTER, // [230] = 0xe6 INVALID_LETTER, // [231] = 0xe7 INVALID_LETTER, // [232] = 0xe8 INVALID_LETTER, // [233] = 0xe9 INVALID_LETTER, // [234] = 0xea INVALID_LETTER, // [235] = 0xeb INVALID_LETTER, // [236] = 0xec INVALID_LETTER, // [237] = 0xed INVALID_LETTER, // [238] = 0xee INVALID_LETTER, // [239] = 0xef INVALID_LETTER, // [240] = 0xf0 INVALID_LETTER, // [241] = 0xf1 INVALID_LETTER, // [242] = 0xf2 INVALID_LETTER, // [243] = 0xf3 INVALID_LETTER, // [244] = 0xf4 INVALID_LETTER, // [245] = 0xf5 INVALID_LETTER, // [246] = 0xf6 INVALID_LETTER, // [247] = 0xf7 INVALID_LETTER, // [248] = 0xf8 INVALID_LETTER, // [249] = 0xf9 INVALID_LETTER, // [250] = 0xfa INVALID_LETTER, // [251] = 0xfb INVALID_LETTER, // [252] = 0xfc INVALID_LETTER, // [253] = 0xfd INVALID_LETTER, // [254] = 0xfe INVALID_LETTER, // [255] = 0xff }; byte g_CharToLetterNucleoW[256] = { 0 , // [ 0] = 0x00 0 , // [ 1] = 0x01 0 , // [ 2] = 0x02 0 , // [ 3] = 0x03 0 , // [ 4] = 0x04 0 , // [ 5] = 0x05 0 , // [ 6] = 0x06 0 , // [ 7] = 0x07 0 , // [ 8] = 0x08 0 , // [ 9] = 0x09 0 , // [ 10] = 0x0a 0 , // [ 11] = 0x0b 0 , // [ 12] = 0x0c 0 , // [ 13] = 0x0d 0 , // [ 14] = 0x0e 0 , // [ 15] = 0x0f 0 , // [ 16] = 0x10 0 , // [ 17] = 0x11 0 , // [ 18] = 0x12 0 , // [ 19] = 0x13 0 , // [ 20] = 0x14 0 , // [ 21] = 0x15 0 , // [ 22] = 0x16 0 , // [ 23] = 0x17 0 , // [ 24] = 0x18 0 , // [ 25] = 0x19 0 , // [ 26] = 0x1a 0 , // [ 27] = 0x1b 0 , // [ 28] = 0x1c 0 , // [ 29] = 0x1d 0 , // [ 30] = 0x1e 0 , // [ 31] = 0x1f 0 , // [ 32] = 32 0 , // [ 33] = 33 0 , // [ 34] = 34 0 , // [ 35] = 35 0 , // [ 36] = 36 0 , // [ 37] = 37 0 , // [ 38] = 38 0 , // [ 39] = 39 0 , // [ 40] = 40 0 , // [ 41] = 41 0 , // [ 42] = 42 0 , // [ 43] = 43 0 , // [ 44] = 44 0 , // [ 45] = 45 0 , // [ 46] = 46 0 , // [ 47] = 47 0 , // [ 48] = 48 0 , // [ 49] = 49 0 , // [ 50] = 
50 0 , // [ 51] = 51 0 , // [ 52] = 52 0 , // [ 53] = 53 0 , // [ 54] = 54 0 , // [ 55] = 55 0 , // [ 56] = 56 0 , // [ 57] = 57 0 , // [ 58] = 58 0 , // [ 59] = 59 0 , // [ 60] = 60 0 , // [ 61] = 61 0 , // [ 62] = 62 0 , // [ 63] = 63 0 , // [ 64] = 64 0 , // [ 65] = A (Nucleotide) 1 , // [ 66] = B (Wildcard) 1 , // [ 67] = C (Nucleotide) 0 , // [ 68] = D (Wildcard) 0 , // [ 69] = 69 0 , // [ 70] = 70 2 , // [ 71] = G (Nucleotide) 0 , // [ 72] = H (Wildcard) 0 , // [ 73] = 73 0 , // [ 74] = 74 2 , // [ 75] = K (Wildcard) 0 , // [ 76] = 76 0 , // [ 77] = M (Wildcard) 2 , // [ 78] = N (Wildcard) 0 , // [ 79] = 79 0 , // [ 80] = 80 0 , // [ 81] = 81 0 , // [ 82] = R (Wildcard) 1 , // [ 83] = S (Wildcard) 3 , // [ 84] = T (Nucleotide) 3 , // [ 85] = U (Nucleotide) 0 , // [ 86] = V (Wildcard) 0 , // [ 87] = W (Wildcard) 2 , // [ 88] = X (Wildcard) 1 , // [ 89] = Y (Wildcard) 0 , // [ 90] = 90 0 , // [ 91] = 91 0 , // [ 92] = 92 0 , // [ 93] = 93 0 , // [ 94] = 94 0 , // [ 95] = 95 0 , // [ 96] = 96 0 , // [ 97] = a (Nucleotide) 1 , // [ 98] = b (Nucleotide) 1 , // [ 99] = c (Nucleotide) 0 , // [100] = d (Nucleotide) 0 , // [101] = 101 0 , // [102] = 102 2 , // [103] = g (Nucleotide) 0 , // [104] = h (Nucleotide) 0 , // [105] = 105 0 , // [106] = 106 2 , // [107] = k (Nucleotide) 0 , // [108] = 108 0 , // [109] = m (Nucleotide) 2 , // [110] = n (Nucleotide) 0 , // [111] = 111 0 , // [112] = 112 0 , // [113] = 113 0 , // [114] = r (Nucleotide) 1 , // [115] = s (Nucleotide) 3 , // [116] = t (Nucleotide) 3 , // [117] = u (Nucleotide) 0 , // [118] = v (Nucleotide) 0 , // [119] = w (Nucleotide) 2 , // [120] = x (Nucleotide) 1 , // [121] = y (Nucleotide) 0 , // [122] = 122 0 , // [123] = 123 0 , // [124] = 124 0 , // [125] = 125 0 , // [126] = 126 0 , // [127] = 0x7f 0 , // [128] = 0x80 0 , // [129] = 0x81 0 , // [130] = 0x82 0 , // [131] = 0x83 0 , // [132] = 0x84 0 , // [133] = 0x85 0 , // [134] = 0x86 0 , // [135] = 0x87 0 , // [136] = 0x88 0 , // [137] = 0x89 0 , // 
[138] = 0x8a 0 , // [139] = 0x8b 0 , // [140] = 0x8c 0 , // [141] = 0x8d 0 , // [142] = 0x8e 0 , // [143] = 0x8f 0 , // [144] = 0x90 0 , // [145] = 0x91 0 , // [146] = 0x92 0 , // [147] = 0x93 0 , // [148] = 0x94 0 , // [149] = 0x95 0 , // [150] = 0x96 0 , // [151] = 0x97 0 , // [152] = 0x98 0 , // [153] = 0x99 0 , // [154] = 0x9a 0 , // [155] = 0x9b 0 , // [156] = 0x9c 0 , // [157] = 0x9d 0 , // [158] = 0x9e 0 , // [159] = 0x9f 0 , // [160] = 0xa0 0 , // [161] = 0xa1 0 , // [162] = 0xa2 0 , // [163] = 0xa3 0 , // [164] = 0xa4 0 , // [165] = 0xa5 0 , // [166] = 0xa6 0 , // [167] = 0xa7 0 , // [168] = 0xa8 0 , // [169] = 0xa9 0 , // [170] = 0xaa 0 , // [171] = 0xab 0 , // [172] = 0xac 0 , // [173] = 0xad 0 , // [174] = 0xae 0 , // [175] = 0xaf 0 , // [176] = 0xb0 0 , // [177] = 0xb1 0 , // [178] = 0xb2 0 , // [179] = 0xb3 0 , // [180] = 0xb4 0 , // [181] = 0xb5 0 , // [182] = 0xb6 0 , // [183] = 0xb7 0 , // [184] = 0xb8 0 , // [185] = 0xb9 0 , // [186] = 0xba 0 , // [187] = 0xbb 0 , // [188] = 0xbc 0 , // [189] = 0xbd 0 , // [190] = 0xbe 0 , // [191] = 0xbf 0 , // [192] = 0xc0 0 , // [193] = 0xc1 0 , // [194] = 0xc2 0 , // [195] = 0xc3 0 , // [196] = 0xc4 0 , // [197] = 0xc5 0 , // [198] = 0xc6 0 , // [199] = 0xc7 0 , // [200] = 0xc8 0 , // [201] = 0xc9 0 , // [202] = 0xca 0 , // [203] = 0xcb 0 , // [204] = 0xcc 0 , // [205] = 0xcd 0 , // [206] = 0xce 0 , // [207] = 0xcf 0 , // [208] = 0xd0 0 , // [209] = 0xd1 0 , // [210] = 0xd2 0 , // [211] = 0xd3 0 , // [212] = 0xd4 0 , // [213] = 0xd5 0 , // [214] = 0xd6 0 , // [215] = 0xd7 0 , // [216] = 0xd8 0 , // [217] = 0xd9 0 , // [218] = 0xda 0 , // [219] = 0xdb 0 , // [220] = 0xdc 0 , // [221] = 0xdd 0 , // [222] = 0xde 0 , // [223] = 0xdf 0 , // [224] = 0xe0 0 , // [225] = 0xe1 0 , // [226] = 0xe2 0 , // [227] = 0xe3 0 , // [228] = 0xe4 0 , // [229] = 0xe5 0 , // [230] = 0xe6 0 , // [231] = 0xe7 0 , // [232] = 0xe8 0 , // [233] = 0xe9 0 , // [234] = 0xea 0 , // [235] = 0xeb 0 , // [236] = 0xec 0 , // [237] = 0xed 0 , // 
[238] = 0xee 0 , // [239] = 0xef 0 , // [240] = 0xf0 0 , // [241] = 0xf1 0 , // [242] = 0xf2 0 , // [243] = 0xf3 0 , // [244] = 0xf4 0 , // [245] = 0xf5 0 , // [246] = 0xf6 0 , // [247] = 0xf7 0 , // [248] = 0xf8 0 , // [249] = 0xf9 0 , // [250] = 0xfa 0 , // [251] = 0xfb 0 , // [252] = 0xfc 0 , // [253] = 0xfd 0 , // [254] = 0xfe 0 , // [255] = 0xff }; byte g_LetterToCharNucleo[256] = { 'A', // [0] 'C', // [1] 'G', // [2] 'T', // [3] INVALID_CHAR, // [4] INVALID_CHAR, // [5] INVALID_CHAR, // [6] INVALID_CHAR, // [7] INVALID_CHAR, // [8] INVALID_CHAR, // [9] INVALID_CHAR, // [10] INVALID_CHAR, // [11] INVALID_CHAR, // [12] INVALID_CHAR, // [13] INVALID_CHAR, // [14] INVALID_CHAR, // [15] INVALID_CHAR, // [16] INVALID_CHAR, // [17] INVALID_CHAR, // [18] INVALID_CHAR, // [19] INVALID_CHAR, // [20] INVALID_CHAR, // [21] INVALID_CHAR, // [22] INVALID_CHAR, // [23] INVALID_CHAR, // [24] INVALID_CHAR, // [25] INVALID_CHAR, // [26] INVALID_CHAR, // [27] INVALID_CHAR, // [28] INVALID_CHAR, // [29] INVALID_CHAR, // [30] INVALID_CHAR, // [31] INVALID_CHAR, // [32] INVALID_CHAR, // [33] INVALID_CHAR, // [34] INVALID_CHAR, // [35] INVALID_CHAR, // [36] INVALID_CHAR, // [37] INVALID_CHAR, // [38] INVALID_CHAR, // [39] INVALID_CHAR, // [40] INVALID_CHAR, // [41] INVALID_CHAR, // [42] INVALID_CHAR, // [43] INVALID_CHAR, // [44] INVALID_CHAR, // [45] INVALID_CHAR, // [46] INVALID_CHAR, // [47] INVALID_CHAR, // [48] INVALID_CHAR, // [49] INVALID_CHAR, // [50] INVALID_CHAR, // [51] INVALID_CHAR, // [52] INVALID_CHAR, // [53] INVALID_CHAR, // [54] INVALID_CHAR, // [55] INVALID_CHAR, // [56] INVALID_CHAR, // [57] INVALID_CHAR, // [58] INVALID_CHAR, // [59] INVALID_CHAR, // [60] INVALID_CHAR, // [61] INVALID_CHAR, // [62] INVALID_CHAR, // [63] INVALID_CHAR, // [64] INVALID_CHAR, // [65] INVALID_CHAR, // [66] INVALID_CHAR, // [67] INVALID_CHAR, // [68] INVALID_CHAR, // [69] INVALID_CHAR, // [70] INVALID_CHAR, // [71] INVALID_CHAR, // [72] INVALID_CHAR, // [73] INVALID_CHAR, // [74] 
INVALID_CHAR, // [75] INVALID_CHAR, // [76] INVALID_CHAR, // [77] INVALID_CHAR, // [78] INVALID_CHAR, // [79] INVALID_CHAR, // [80] INVALID_CHAR, // [81] INVALID_CHAR, // [82] INVALID_CHAR, // [83] INVALID_CHAR, // [84] INVALID_CHAR, // [85] INVALID_CHAR, // [86] INVALID_CHAR, // [87] INVALID_CHAR, // [88] INVALID_CHAR, // [89] INVALID_CHAR, // [90] INVALID_CHAR, // [91] INVALID_CHAR, // [92] INVALID_CHAR, // [93] INVALID_CHAR, // [94] INVALID_CHAR, // [95] INVALID_CHAR, // [96] INVALID_CHAR, // [97] INVALID_CHAR, // [98] INVALID_CHAR, // [99] INVALID_CHAR, // [100] INVALID_CHAR, // [101] INVALID_CHAR, // [102] INVALID_CHAR, // [103] INVALID_CHAR, // [104] INVALID_CHAR, // [105] INVALID_CHAR, // [106] INVALID_CHAR, // [107] INVALID_CHAR, // [108] INVALID_CHAR, // [109] INVALID_CHAR, // [110] INVALID_CHAR, // [111] INVALID_CHAR, // [112] INVALID_CHAR, // [113] INVALID_CHAR, // [114] INVALID_CHAR, // [115] INVALID_CHAR, // [116] INVALID_CHAR, // [117] INVALID_CHAR, // [118] INVALID_CHAR, // [119] INVALID_CHAR, // [120] INVALID_CHAR, // [121] INVALID_CHAR, // [122] INVALID_CHAR, // [123] INVALID_CHAR, // [124] INVALID_CHAR, // [125] INVALID_CHAR, // [126] INVALID_CHAR, // [127] INVALID_CHAR, // [128] INVALID_CHAR, // [129] INVALID_CHAR, // [130] INVALID_CHAR, // [131] INVALID_CHAR, // [132] INVALID_CHAR, // [133] INVALID_CHAR, // [134] INVALID_CHAR, // [135] INVALID_CHAR, // [136] INVALID_CHAR, // [137] INVALID_CHAR, // [138] INVALID_CHAR, // [139] INVALID_CHAR, // [140] INVALID_CHAR, // [141] INVALID_CHAR, // [142] INVALID_CHAR, // [143] INVALID_CHAR, // [144] INVALID_CHAR, // [145] INVALID_CHAR, // [146] INVALID_CHAR, // [147] INVALID_CHAR, // [148] INVALID_CHAR, // [149] INVALID_CHAR, // [150] INVALID_CHAR, // [151] INVALID_CHAR, // [152] INVALID_CHAR, // [153] INVALID_CHAR, // [154] INVALID_CHAR, // [155] INVALID_CHAR, // [156] INVALID_CHAR, // [157] INVALID_CHAR, // [158] INVALID_CHAR, // [159] INVALID_CHAR, // [160] INVALID_CHAR, // [161] INVALID_CHAR, // [162] 
INVALID_CHAR, // [163] INVALID_CHAR, // [164] INVALID_CHAR, // [165] INVALID_CHAR, // [166] INVALID_CHAR, // [167] INVALID_CHAR, // [168] INVALID_CHAR, // [169] INVALID_CHAR, // [170] INVALID_CHAR, // [171] INVALID_CHAR, // [172] INVALID_CHAR, // [173] INVALID_CHAR, // [174] INVALID_CHAR, // [175] INVALID_CHAR, // [176] INVALID_CHAR, // [177] INVALID_CHAR, // [178] INVALID_CHAR, // [179] INVALID_CHAR, // [180] INVALID_CHAR, // [181] INVALID_CHAR, // [182] INVALID_CHAR, // [183] INVALID_CHAR, // [184] INVALID_CHAR, // [185] INVALID_CHAR, // [186] INVALID_CHAR, // [187] INVALID_CHAR, // [188] INVALID_CHAR, // [189] INVALID_CHAR, // [190] INVALID_CHAR, // [191] INVALID_CHAR, // [192] INVALID_CHAR, // [193] INVALID_CHAR, // [194] INVALID_CHAR, // [195] INVALID_CHAR, // [196] INVALID_CHAR, // [197] INVALID_CHAR, // [198] INVALID_CHAR, // [199] INVALID_CHAR, // [200] INVALID_CHAR, // [201] INVALID_CHAR, // [202] INVALID_CHAR, // [203] INVALID_CHAR, // [204] INVALID_CHAR, // [205] INVALID_CHAR, // [206] INVALID_CHAR, // [207] INVALID_CHAR, // [208] INVALID_CHAR, // [209] INVALID_CHAR, // [210] INVALID_CHAR, // [211] INVALID_CHAR, // [212] INVALID_CHAR, // [213] INVALID_CHAR, // [214] INVALID_CHAR, // [215] INVALID_CHAR, // [216] INVALID_CHAR, // [217] INVALID_CHAR, // [218] INVALID_CHAR, // [219] INVALID_CHAR, // [220] INVALID_CHAR, // [221] INVALID_CHAR, // [222] INVALID_CHAR, // [223] INVALID_CHAR, // [224] INVALID_CHAR, // [225] INVALID_CHAR, // [226] INVALID_CHAR, // [227] INVALID_CHAR, // [228] INVALID_CHAR, // [229] INVALID_CHAR, // [230] INVALID_CHAR, // [231] INVALID_CHAR, // [232] INVALID_CHAR, // [233] INVALID_CHAR, // [234] INVALID_CHAR, // [235] INVALID_CHAR, // [236] INVALID_CHAR, // [237] INVALID_CHAR, // [238] INVALID_CHAR, // [239] INVALID_CHAR, // [240] INVALID_CHAR, // [241] INVALID_CHAR, // [242] INVALID_CHAR, // [243] INVALID_CHAR, // [244] INVALID_CHAR, // [245] INVALID_CHAR, // [246] INVALID_CHAR, // [247] INVALID_CHAR, // [248] INVALID_CHAR, // 
[249] INVALID_CHAR, // [250] INVALID_CHAR, // [251] INVALID_CHAR, // [252] INVALID_CHAR, // [253] INVALID_CHAR, // [254] INVALID_CHAR, // [255] }; byte g_LetterToCharNucleoGap[256] = { 'A', // [0] 'C', // [1] 'G', // [2] 'T', // [3] '-', // [4] INVALID_CHAR, // [5] INVALID_CHAR, // [6] INVALID_CHAR, // [7] INVALID_CHAR, // [8] INVALID_CHAR, // [9] INVALID_CHAR, // [10] INVALID_CHAR, // [11] INVALID_CHAR, // [12] INVALID_CHAR, // [13] INVALID_CHAR, // [14] INVALID_CHAR, // [15] INVALID_CHAR, // [16] INVALID_CHAR, // [17] INVALID_CHAR, // [18] INVALID_CHAR, // [19] INVALID_CHAR, // [20] INVALID_CHAR, // [21] INVALID_CHAR, // [22] INVALID_CHAR, // [23] INVALID_CHAR, // [24] INVALID_CHAR, // [25] INVALID_CHAR, // [26] INVALID_CHAR, // [27] INVALID_CHAR, // [28] INVALID_CHAR, // [29] INVALID_CHAR, // [30] INVALID_CHAR, // [31] INVALID_CHAR, // [32] INVALID_CHAR, // [33] INVALID_CHAR, // [34] INVALID_CHAR, // [35] INVALID_CHAR, // [36] INVALID_CHAR, // [37] INVALID_CHAR, // [38] INVALID_CHAR, // [39] INVALID_CHAR, // [40] INVALID_CHAR, // [41] INVALID_CHAR, // [42] INVALID_CHAR, // [43] INVALID_CHAR, // [44] INVALID_CHAR, // [45] INVALID_CHAR, // [46] INVALID_CHAR, // [47] INVALID_CHAR, // [48] INVALID_CHAR, // [49] INVALID_CHAR, // [50] INVALID_CHAR, // [51] INVALID_CHAR, // [52] INVALID_CHAR, // [53] INVALID_CHAR, // [54] INVALID_CHAR, // [55] INVALID_CHAR, // [56] INVALID_CHAR, // [57] INVALID_CHAR, // [58] INVALID_CHAR, // [59] INVALID_CHAR, // [60] INVALID_CHAR, // [61] INVALID_CHAR, // [62] INVALID_CHAR, // [63] INVALID_CHAR, // [64] INVALID_CHAR, // [65] INVALID_CHAR, // [66] INVALID_CHAR, // [67] INVALID_CHAR, // [68] INVALID_CHAR, // [69] INVALID_CHAR, // [70] INVALID_CHAR, // [71] INVALID_CHAR, // [72] INVALID_CHAR, // [73] INVALID_CHAR, // [74] INVALID_CHAR, // [75] INVALID_CHAR, // [76] INVALID_CHAR, // [77] INVALID_CHAR, // [78] INVALID_CHAR, // [79] INVALID_CHAR, // [80] INVALID_CHAR, // [81] INVALID_CHAR, // [82] INVALID_CHAR, // [83] INVALID_CHAR, // [84] 
INVALID_CHAR, // [85] INVALID_CHAR, // [86] INVALID_CHAR, // [87] INVALID_CHAR, // [88] INVALID_CHAR, // [89] INVALID_CHAR, // [90] INVALID_CHAR, // [91] INVALID_CHAR, // [92] INVALID_CHAR, // [93] INVALID_CHAR, // [94] INVALID_CHAR, // [95] INVALID_CHAR, // [96] INVALID_CHAR, // [97] INVALID_CHAR, // [98] INVALID_CHAR, // [99] INVALID_CHAR, // [100] INVALID_CHAR, // [101] INVALID_CHAR, // [102] INVALID_CHAR, // [103] INVALID_CHAR, // [104] INVALID_CHAR, // [105] INVALID_CHAR, // [106] INVALID_CHAR, // [107] INVALID_CHAR, // [108] INVALID_CHAR, // [109] INVALID_CHAR, // [110] INVALID_CHAR, // [111] INVALID_CHAR, // [112] INVALID_CHAR, // [113] INVALID_CHAR, // [114] INVALID_CHAR, // [115] INVALID_CHAR, // [116] INVALID_CHAR, // [117] INVALID_CHAR, // [118] INVALID_CHAR, // [119] INVALID_CHAR, // [120] INVALID_CHAR, // [121] INVALID_CHAR, // [122] INVALID_CHAR, // [123] INVALID_CHAR, // [124] INVALID_CHAR, // [125] INVALID_CHAR, // [126] INVALID_CHAR, // [127] INVALID_CHAR, // [128] INVALID_CHAR, // [129] INVALID_CHAR, // [130] INVALID_CHAR, // [131] INVALID_CHAR, // [132] INVALID_CHAR, // [133] INVALID_CHAR, // [134] INVALID_CHAR, // [135] INVALID_CHAR, // [136] INVALID_CHAR, // [137] INVALID_CHAR, // [138] INVALID_CHAR, // [139] INVALID_CHAR, // [140] INVALID_CHAR, // [141] INVALID_CHAR, // [142] INVALID_CHAR, // [143] INVALID_CHAR, // [144] INVALID_CHAR, // [145] INVALID_CHAR, // [146] INVALID_CHAR, // [147] INVALID_CHAR, // [148] INVALID_CHAR, // [149] INVALID_CHAR, // [150] INVALID_CHAR, // [151] INVALID_CHAR, // [152] INVALID_CHAR, // [153] INVALID_CHAR, // [154] INVALID_CHAR, // [155] INVALID_CHAR, // [156] INVALID_CHAR, // [157] INVALID_CHAR, // [158] INVALID_CHAR, // [159] INVALID_CHAR, // [160] INVALID_CHAR, // [161] INVALID_CHAR, // [162] INVALID_CHAR, // [163] INVALID_CHAR, // [164] INVALID_CHAR, // [165] INVALID_CHAR, // [166] INVALID_CHAR, // [167] INVALID_CHAR, // [168] INVALID_CHAR, // [169] INVALID_CHAR, // [170] INVALID_CHAR, // [171] INVALID_CHAR, 
// [172] INVALID_CHAR, // [173] INVALID_CHAR, // [174] INVALID_CHAR, // [175] INVALID_CHAR, // [176] INVALID_CHAR, // [177] INVALID_CHAR, // [178] INVALID_CHAR, // [179] INVALID_CHAR, // [180] INVALID_CHAR, // [181] INVALID_CHAR, // [182] INVALID_CHAR, // [183] INVALID_CHAR, // [184] INVALID_CHAR, // [185] INVALID_CHAR, // [186] INVALID_CHAR, // [187] INVALID_CHAR, // [188] INVALID_CHAR, // [189] INVALID_CHAR, // [190] INVALID_CHAR, // [191] INVALID_CHAR, // [192] INVALID_CHAR, // [193] INVALID_CHAR, // [194] INVALID_CHAR, // [195] INVALID_CHAR, // [196] INVALID_CHAR, // [197] INVALID_CHAR, // [198] INVALID_CHAR, // [199] INVALID_CHAR, // [200] INVALID_CHAR, // [201] INVALID_CHAR, // [202] INVALID_CHAR, // [203] INVALID_CHAR, // [204] INVALID_CHAR, // [205] INVALID_CHAR, // [206] INVALID_CHAR, // [207] INVALID_CHAR, // [208] INVALID_CHAR, // [209] INVALID_CHAR, // [210] INVALID_CHAR, // [211] INVALID_CHAR, // [212] INVALID_CHAR, // [213] INVALID_CHAR, // [214] INVALID_CHAR, // [215] INVALID_CHAR, // [216] INVALID_CHAR, // [217] INVALID_CHAR, // [218] INVALID_CHAR, // [219] INVALID_CHAR, // [220] INVALID_CHAR, // [221] INVALID_CHAR, // [222] INVALID_CHAR, // [223] INVALID_CHAR, // [224] INVALID_CHAR, // [225] INVALID_CHAR, // [226] INVALID_CHAR, // [227] INVALID_CHAR, // [228] INVALID_CHAR, // [229] INVALID_CHAR, // [230] INVALID_CHAR, // [231] INVALID_CHAR, // [232] INVALID_CHAR, // [233] INVALID_CHAR, // [234] INVALID_CHAR, // [235] INVALID_CHAR, // [236] INVALID_CHAR, // [237] INVALID_CHAR, // [238] INVALID_CHAR, // [239] INVALID_CHAR, // [240] INVALID_CHAR, // [241] INVALID_CHAR, // [242] INVALID_CHAR, // [243] INVALID_CHAR, // [244] INVALID_CHAR, // [245] INVALID_CHAR, // [246] INVALID_CHAR, // [247] INVALID_CHAR, // [248] INVALID_CHAR, // [249] INVALID_CHAR, // [250] INVALID_CHAR, // [251] INVALID_CHAR, // [252] INVALID_CHAR, // [253] INVALID_CHAR, // [254] INVALID_CHAR, // [255] }; byte g_CodonWordToAminoLetter[4*4*4] = { 8 , // [ 0] = AAA K (Lys) 11, // [ 1] 
= AAC N (Asn) 8 , // [ 2] = AAG K (Lys) 11, // [ 3] = AAT N (Asn) 16, // [ 4] = ACA T (Thr) 16, // [ 5] = ACC T (Thr) 16, // [ 6] = ACG T (Thr) 16, // [ 7] = ACT T (Thr) 14, // [ 8] = AGA R (Arg) 15, // [ 9] = AGC S (Ser) 14, // [10] = AGG R (Arg) 15, // [11] = AGT S (Ser) 7 , // [12] = ATA I (Ile) 7 , // [13] = ATC I (Ile) 10, // [14] = ATG M (Met) 7 , // [15] = ATT I (Ile) 13, // [16] = CAA Q (Gln) 6 , // [17] = CAC H (His) 13, // [18] = CAG Q (Gln) 6 , // [19] = CAT H (His) 12, // [20] = CCA P (Pro) 12, // [21] = CCC P (Pro) 12, // [22] = CCG P (Pro) 12, // [23] = CCT P (Pro) 14, // [24] = CGA R (Arg) 14, // [25] = CGC R (Arg) 14, // [26] = CGG R (Arg) 14, // [27] = CGT R (Arg) 9 , // [28] = CTA L (Leu) 9 , // [29] = CTC L (Leu) 9 , // [30] = CTG L (Leu) 9 , // [31] = CTT L (Leu) 3 , // [32] = GAA E (Glu) 2 , // [33] = GAC D (Asp) 3 , // [34] = GAG E (Glu) 2 , // [35] = GAT D (Asp) 0 , // [36] = GCA A (Ala) 0 , // [37] = GCC A (Ala) 0 , // [38] = GCG A (Ala) 0 , // [39] = GCT A (Ala) 5 , // [40] = GGA G (Gly) 5 , // [41] = GGC G (Gly) 5 , // [42] = GGG G (Gly) 5 , // [43] = GGT G (Gly) 17, // [44] = GTA V (Val) 17, // [45] = GTC V (Val) 17, // [46] = GTG V (Val) 17, // [47] = GTT V (Val) 20, // [48] = TAA * (STP) 19, // [49] = TAC Y (Tyr) 20, // [50] = TAG * (STP) 19, // [51] = TAT Y (Tyr) 15, // [52] = TCA S (Ser) 15, // [53] = TCC S (Ser) 15, // [54] = TCG S (Ser) 15, // [55] = TCT S (Ser) 20, // [56] = TGA * (STP) 1 , // [57] = TGC C (Cys) 18, // [58] = TGG W (Trp) 1 , // [59] = TGT C (Cys) 9 , // [60] = TTA L (Leu) 4 , // [61] = TTC F (Phe) 9 , // [62] = TTG L (Leu) 4 , // [63] = TTT F (Phe) }; byte g_CodonWordToAminoChar[4*4*4] = { 'K', // [ 0] = AAA (Lys) 'N', // [ 1] = AAC (Asn) 'K', // [ 2] = AAG (Lys) 'N', // [ 3] = AAT (Asn) 'T', // [ 4] = ACA (Thr) 'T', // [ 5] = ACC (Thr) 'T', // [ 6] = ACG (Thr) 'T', // [ 7] = ACT (Thr) 'R', // [ 8] = AGA (Arg) 'S', // [ 9] = AGC (Ser) 'R', // [10] = AGG (Arg) 'S', // [11] = AGT (Ser) 'I', // [12] = ATA (Ile) 'I', 
// [13] = ATC (Ile) 'M', // [14] = ATG (Met) 'I', // [15] = ATT (Ile) 'Q', // [16] = CAA (Gln) 'H', // [17] = CAC (His) 'Q', // [18] = CAG (Gln) 'H', // [19] = CAT (His) 'P', // [20] = CCA (Pro) 'P', // [21] = CCC (Pro) 'P', // [22] = CCG (Pro) 'P', // [23] = CCT (Pro) 'R', // [24] = CGA (Arg) 'R', // [25] = CGC (Arg) 'R', // [26] = CGG (Arg) 'R', // [27] = CGT (Arg) 'L', // [28] = CTA (Leu) 'L', // [29] = CTC (Leu) 'L', // [30] = CTG (Leu) 'L', // [31] = CTT (Leu) 'E', // [32] = GAA (Glu) 'D', // [33] = GAC (Asp) 'E', // [34] = GAG (Glu) 'D', // [35] = GAT (Asp) 'A', // [36] = GCA (Ala) 'A', // [37] = GCC (Ala) 'A', // [38] = GCG (Ala) 'A', // [39] = GCT (Ala) 'G', // [40] = GGA (Gly) 'G', // [41] = GGC (Gly) 'G', // [42] = GGG (Gly) 'G', // [43] = GGT (Gly) 'V', // [44] = GTA (Val) 'V', // [45] = GTC (Val) 'V', // [46] = GTG (Val) 'V', // [47] = GTT (Val) '*', // [48] = TAA (STP) 'Y', // [49] = TAC (Tyr) '*', // [50] = TAG (STP) 'Y', // [51] = TAT (Tyr) 'S', // [52] = TCA (Ser) 'S', // [53] = TCC (Ser) 'S', // [54] = TCG (Ser) 'S', // [55] = TCT (Ser) '*', // [56] = TGA (STP) 'C', // [57] = TGC (Cys) 'W', // [58] = TGG (Trp) 'C', // [59] = TGT (Cys) 'L', // [60] = TTA (Leu) 'F', // [61] = TTC (Phe) 'L', // [62] = TTG (Leu) 'F', // [63] = TTT (Phe) }; byte g_CharToCompChar[256] = { '?', // [ 0] 0x00 invalid nt '?', // [ 1] 0x01 invalid nt '?', // [ 2] 0x02 invalid nt '?', // [ 3] 0x03 invalid nt '?', // [ 4] 0x04 invalid nt '?', // [ 5] 0x05 invalid nt '?', // [ 6] 0x06 invalid nt '?', // [ 7] 0x07 invalid nt '?', // [ 8] 0x08 invalid nt '?', // [ 9] 0x09 invalid nt '?', // [ 10] 0x0a invalid nt '?', // [ 11] 0x0b invalid nt '?', // [ 12] 0x0c invalid nt '?', // [ 13] 0x0d invalid nt '?', // [ 14] 0x0e invalid nt '?', // [ 15] 0x0f invalid nt '?', // [ 16] 0x10 invalid nt '?', // [ 17] 0x11 invalid nt '?', // [ 18] 0x12 invalid nt '?', // [ 19] 0x13 invalid nt '?', // [ 20] 0x14 invalid nt '?', // [ 21] 0x15 invalid nt '?', // [ 22] 0x16 invalid nt '?', // [ 23] 
0x17 invalid nt '?', // [ 24] 0x18 invalid nt '?', // [ 25] 0x19 invalid nt '?', // [ 26] 0x1a invalid nt '?', // [ 27] 0x1b invalid nt '?', // [ 28] 0x1c invalid nt '?', // [ 29] 0x1d invalid nt '?', // [ 30] 0x1e invalid nt '?', // [ 31] 0x1f invalid nt '?', // [ 32] 0x20 invalid nt '?', // [ 33] 0x21 invalid nt '?', // [ 34] 0x22 invalid nt '?', // [ 35] 0x23 invalid nt '?', // [ 36] 0x24 invalid nt '?', // [ 37] 0x25 invalid nt '?', // [ 38] 0x26 invalid nt '?', // [ 39] 0x27 invalid nt '?', // [ 40] 0x28 invalid nt '?', // [ 41] 0x29 invalid nt '?', // [ 42] 0x2a invalid nt '?', // [ 43] 0x2b invalid nt '?', // [ 44] 0x2c invalid nt '?', // [ 45] 0x2d invalid nt '?', // [ 46] 0x2e invalid nt '?', // [ 47] 0x2f invalid nt '?', // [ 48] 0x30 invalid nt '?', // [ 49] 0x31 invalid nt '?', // [ 50] 0x32 invalid nt '?', // [ 51] 0x33 invalid nt '?', // [ 52] 0x34 invalid nt '?', // [ 53] 0x35 invalid nt '?', // [ 54] 0x36 invalid nt '?', // [ 55] 0x37 invalid nt '?', // [ 56] 0x38 invalid nt '?', // [ 57] 0x39 invalid nt '?', // [ 58] 0x3a invalid nt '?', // [ 59] 0x3b invalid nt '?', // [ 60] 0x3c invalid nt '?', // [ 61] 0x3d invalid nt '?', // [ 62] 0x3e invalid nt '?', // [ 63] 0x3f invalid nt '?', // [ 64] 0x40 invalid nt 'T', // [ 65] A -> T 'V', // [ 66] B -> V 'G', // [ 67] C -> G 'H', // [ 68] D -> H '?', // [ 69] E invalid nt '?', // [ 70] F invalid nt 'C', // [ 71] G -> C 'D', // [ 72] H -> D '?', // [ 73] I invalid nt '?', // [ 74] J invalid nt 'M', // [ 75] K -> M '?', // [ 76] L invalid nt 'K', // [ 77] M -> K 'N', // [ 78] N -> N '?', // [ 79] O invalid nt '?', // [ 80] P invalid nt '?', // [ 81] Q invalid nt 'Y', // [ 82] R -> Y 'S', // [ 83] S -> S 'A', // [ 84] T -> A 'A', // [ 85] U -> A 'B', // [ 86] V -> B 'W', // [ 87] W -> W 'X', // [ 88] X -> X 'R', // [ 89] Y -> R '?', // [ 90] Z invalid nt '?', // [ 91] 0x5b invalid nt '?', // [ 92] 0x5c invalid nt '?', // [ 93] 0x5d invalid nt '?', // [ 94] 0x5e invalid nt '?', // [ 95] 0x5f invalid nt 
'?', // [ 96] 0x60 invalid nt 't', // [ 97] a -> t 'v', // [ 98] b -> v 'g', // [ 99] c -> g 'h', // [100] d -> h '?', // [101] e invalid nt '?', // [102] f invalid nt 'c', // [103] g -> c 'd', // [104] h -> d '?', // [105] i invalid nt '?', // [106] j invalid nt 'm', // [107] k -> m '?', // [108] l invalid nt 'k', // [109] m -> k 'n', // [110] n -> n '?', // [111] o invalid nt '?', // [112] p invalid nt '?', // [113] q invalid nt 'y', // [114] r -> y 's', // [115] s -> s 'a', // [116] t -> a '?', // [117] u invalid nt 'b', // [118] v -> b 'w', // [119] w -> w 'x', // [120] x -> x 'r', // [121] y -> r '?', // [122] z invalid nt '?', // [123] 0x7b invalid nt '?', // [124] 0x7c invalid nt '?', // [125] 0x7d invalid nt '?', // [126] 0x7e invalid nt '?', // [127] 0x7f invalid nt '?', // [128] 0x80 invalid nt '?', // [129] 0x81 invalid nt '?', // [130] 0x82 invalid nt '?', // [131] 0x83 invalid nt '?', // [132] 0x84 invalid nt '?', // [133] 0x85 invalid nt '?', // [134] 0x86 invalid nt '?', // [135] 0x87 invalid nt '?', // [136] 0x88 invalid nt '?', // [137] 0x89 invalid nt '?', // [138] 0x8a invalid nt '?', // [139] 0x8b invalid nt '?', // [140] 0x8c invalid nt '?', // [141] 0x8d invalid nt '?', // [142] 0x8e invalid nt '?', // [143] 0x8f invalid nt '?', // [144] 0x90 invalid nt '?', // [145] 0x91 invalid nt '?', // [146] 0x92 invalid nt '?', // [147] 0x93 invalid nt '?', // [148] 0x94 invalid nt '?', // [149] 0x95 invalid nt '?', // [150] 0x96 invalid nt '?', // [151] 0x97 invalid nt '?', // [152] 0x98 invalid nt '?', // [153] 0x99 invalid nt '?', // [154] 0x9a invalid nt '?', // [155] 0x9b invalid nt '?', // [156] 0x9c invalid nt '?', // [157] 0x9d invalid nt '?', // [158] 0x9e invalid nt '?', // [159] 0x9f invalid nt '?', // [160] 0xa0 invalid nt '?', // [161] 0xa1 invalid nt '?', // [162] 0xa2 invalid nt '?', // [163] 0xa3 invalid nt '?', // [164] 0xa4 invalid nt '?', // [165] 0xa5 invalid nt '?', // [166] 0xa6 invalid nt '?', // [167] 0xa7 invalid nt '?', // [168] 
0xa8 invalid nt '?', // [169] 0xa9 invalid nt '?', // [170] 0xaa invalid nt '?', // [171] 0xab invalid nt '?', // [172] 0xac invalid nt '?', // [173] 0xad invalid nt '?', // [174] 0xae invalid nt '?', // [175] 0xaf invalid nt '?', // [176] 0xb0 invalid nt '?', // [177] 0xb1 invalid nt '?', // [178] 0xb2 invalid nt '?', // [179] 0xb3 invalid nt '?', // [180] 0xb4 invalid nt '?', // [181] 0xb5 invalid nt '?', // [182] 0xb6 invalid nt '?', // [183] 0xb7 invalid nt '?', // [184] 0xb8 invalid nt '?', // [185] 0xb9 invalid nt '?', // [186] 0xba invalid nt '?', // [187] 0xbb invalid nt '?', // [188] 0xbc invalid nt '?', // [189] 0xbd invalid nt '?', // [190] 0xbe invalid nt '?', // [191] 0xbf invalid nt '?', // [192] 0xc0 invalid nt '?', // [193] 0xc1 invalid nt '?', // [194] 0xc2 invalid nt '?', // [195] 0xc3 invalid nt '?', // [196] 0xc4 invalid nt '?', // [197] 0xc5 invalid nt '?', // [198] 0xc6 invalid nt '?', // [199] 0xc7 invalid nt '?', // [200] 0xc8 invalid nt '?', // [201] 0xc9 invalid nt '?', // [202] 0xca invalid nt '?', // [203] 0xcb invalid nt '?', // [204] 0xcc invalid nt '?', // [205] 0xcd invalid nt '?', // [206] 0xce invalid nt '?', // [207] 0xcf invalid nt '?', // [208] 0xd0 invalid nt '?', // [209] 0xd1 invalid nt '?', // [210] 0xd2 invalid nt '?', // [211] 0xd3 invalid nt '?', // [212] 0xd4 invalid nt '?', // [213] 0xd5 invalid nt '?', // [214] 0xd6 invalid nt '?', // [215] 0xd7 invalid nt '?', // [216] 0xd8 invalid nt '?', // [217] 0xd9 invalid nt '?', // [218] 0xda invalid nt '?', // [219] 0xdb invalid nt '?', // [220] 0xdc invalid nt '?', // [221] 0xdd invalid nt '?', // [222] 0xde invalid nt '?', // [223] 0xdf invalid nt '?', // [224] 0xe0 invalid nt '?', // [225] 0xe1 invalid nt '?', // [226] 0xe2 invalid nt '?', // [227] 0xe3 invalid nt '?', // [228] 0xe4 invalid nt '?', // [229] 0xe5 invalid nt '?', // [230] 0xe6 invalid nt '?', // [231] 0xe7 invalid nt '?', // [232] 0xe8 invalid nt '?', // [233] 0xe9 invalid nt '?', // [234] 0xea invalid nt 
'?', // [235] 0xeb invalid nt '?', // [236] 0xec invalid nt '?', // [237] 0xed invalid nt '?', // [238] 0xee invalid nt '?', // [239] 0xef invalid nt '?', // [240] 0xf0 invalid nt '?', // [241] 0xf1 invalid nt '?', // [242] 0xf2 invalid nt '?', // [243] 0xf3 invalid nt '?', // [244] 0xf4 invalid nt '?', // [245] 0xf5 invalid nt '?', // [246] 0xf6 invalid nt '?', // [247] 0xf7 invalid nt '?', // [248] 0xf8 invalid nt '?', // [249] 0xf9 invalid nt '?', // [250] 0xfa invalid nt '?', // [251] 0xfb invalid nt '?', // [252] 0xfc invalid nt '?', // [253] 0xfd invalid nt '?', // [254] 0xfe invalid nt '?', // [255] 0xff invalid nt }; byte g_LetterToCompLetter[256] = { 3, // 0(A) -> 3(T) 2, // 1(C) -> 2(G) 1, // 2(G) -> 1(C) 0, // 3(T) -> 0(A) INVALID_LETTER, // [ 4] INVALID_LETTER, // [ 5] INVALID_LETTER, // [ 6] INVALID_LETTER, // [ 7] INVALID_LETTER, // [ 8] INVALID_LETTER, // [ 9] INVALID_LETTER, // [ 10] INVALID_LETTER, // [ 11] INVALID_LETTER, // [ 12] INVALID_LETTER, // [ 13] INVALID_LETTER, // [ 14] INVALID_LETTER, // [ 15] INVALID_LETTER, // [ 16] INVALID_LETTER, // [ 17] INVALID_LETTER, // [ 18] INVALID_LETTER, // [ 19] INVALID_LETTER, // [ 20] INVALID_LETTER, // [ 21] INVALID_LETTER, // [ 22] INVALID_LETTER, // [ 23] INVALID_LETTER, // [ 24] INVALID_LETTER, // [ 25] INVALID_LETTER, // [ 26] INVALID_LETTER, // [ 27] INVALID_LETTER, // [ 28] INVALID_LETTER, // [ 29] INVALID_LETTER, // [ 30] INVALID_LETTER, // [ 31] INVALID_LETTER, // [ 32] INVALID_LETTER, // [ 33] INVALID_LETTER, // [ 34] INVALID_LETTER, // [ 35] INVALID_LETTER, // [ 36] INVALID_LETTER, // [ 37] INVALID_LETTER, // [ 38] INVALID_LETTER, // [ 39] INVALID_LETTER, // [ 40] INVALID_LETTER, // [ 41] INVALID_LETTER, // [ 42] INVALID_LETTER, // [ 43] INVALID_LETTER, // [ 44] INVALID_LETTER, // [ 45] INVALID_LETTER, // [ 46] INVALID_LETTER, // [ 47] INVALID_LETTER, // [ 48] INVALID_LETTER, // [ 49] INVALID_LETTER, // [ 50] INVALID_LETTER, // [ 51] INVALID_LETTER, // [ 52] INVALID_LETTER, // [ 53] 
INVALID_LETTER, // [ 54] INVALID_LETTER, // [ 55] INVALID_LETTER, // [ 56] INVALID_LETTER, // [ 57] INVALID_LETTER, // [ 58] INVALID_LETTER, // [ 59] INVALID_LETTER, // [ 60] INVALID_LETTER, // [ 61] INVALID_LETTER, // [ 62] INVALID_LETTER, // [ 63] INVALID_LETTER, // [ 64] INVALID_LETTER, // [ 65] INVALID_LETTER, // [ 66] INVALID_LETTER, // [ 67] INVALID_LETTER, // [ 68] INVALID_LETTER, // [ 69] INVALID_LETTER, // [ 70] INVALID_LETTER, // [ 71] INVALID_LETTER, // [ 72] INVALID_LETTER, // [ 73] INVALID_LETTER, // [ 74] INVALID_LETTER, // [ 75] INVALID_LETTER, // [ 76] INVALID_LETTER, // [ 77] INVALID_LETTER, // [ 78] INVALID_LETTER, // [ 79] INVALID_LETTER, // [ 80] INVALID_LETTER, // [ 81] INVALID_LETTER, // [ 82] INVALID_LETTER, // [ 83] INVALID_LETTER, // [ 84] INVALID_LETTER, // [ 85] INVALID_LETTER, // [ 86] INVALID_LETTER, // [ 87] INVALID_LETTER, // [ 88] INVALID_LETTER, // [ 89] INVALID_LETTER, // [ 90] INVALID_LETTER, // [ 91] INVALID_LETTER, // [ 92] INVALID_LETTER, // [ 93] INVALID_LETTER, // [ 94] INVALID_LETTER, // [ 95] INVALID_LETTER, // [ 96] INVALID_LETTER, // [ 97] INVALID_LETTER, // [ 98] INVALID_LETTER, // [ 99] INVALID_LETTER, // [100] INVALID_LETTER, // [101] INVALID_LETTER, // [102] INVALID_LETTER, // [103] INVALID_LETTER, // [104] INVALID_LETTER, // [105] INVALID_LETTER, // [106] INVALID_LETTER, // [107] INVALID_LETTER, // [108] INVALID_LETTER, // [109] INVALID_LETTER, // [110] INVALID_LETTER, // [111] INVALID_LETTER, // [112] INVALID_LETTER, // [113] INVALID_LETTER, // [114] INVALID_LETTER, // [115] INVALID_LETTER, // [116] INVALID_LETTER, // [117] INVALID_LETTER, // [118] INVALID_LETTER, // [119] INVALID_LETTER, // [120] INVALID_LETTER, // [121] INVALID_LETTER, // [122] INVALID_LETTER, // [123] INVALID_LETTER, // [124] INVALID_LETTER, // [125] INVALID_LETTER, // [126] INVALID_LETTER, // [127] INVALID_LETTER, // [128] INVALID_LETTER, // [129] INVALID_LETTER, // [130] INVALID_LETTER, // [131] INVALID_LETTER, // [132] INVALID_LETTER, // [133] 
INVALID_LETTER, // [134] INVALID_LETTER, // [135] INVALID_LETTER, // [136] INVALID_LETTER, // [137] INVALID_LETTER, // [138] INVALID_LETTER, // [139] INVALID_LETTER, // [140] INVALID_LETTER, // [141] INVALID_LETTER, // [142] INVALID_LETTER, // [143] INVALID_LETTER, // [144] INVALID_LETTER, // [145] INVALID_LETTER, // [146] INVALID_LETTER, // [147] INVALID_LETTER, // [148] INVALID_LETTER, // [149] INVALID_LETTER, // [150] INVALID_LETTER, // [151] INVALID_LETTER, // [152] INVALID_LETTER, // [153] INVALID_LETTER, // [154] INVALID_LETTER, // [155] INVALID_LETTER, // [156] INVALID_LETTER, // [157] INVALID_LETTER, // [158] INVALID_LETTER, // [159] INVALID_LETTER, // [160] INVALID_LETTER, // [161] INVALID_LETTER, // [162] INVALID_LETTER, // [163] INVALID_LETTER, // [164] INVALID_LETTER, // [165] INVALID_LETTER, // [166] INVALID_LETTER, // [167] INVALID_LETTER, // [168] INVALID_LETTER, // [169] INVALID_LETTER, // [170] INVALID_LETTER, // [171] INVALID_LETTER, // [172] INVALID_LETTER, // [173] INVALID_LETTER, // [174] INVALID_LETTER, // [175] INVALID_LETTER, // [176] INVALID_LETTER, // [177] INVALID_LETTER, // [178] INVALID_LETTER, // [179] INVALID_LETTER, // [180] INVALID_LETTER, // [181] INVALID_LETTER, // [182] INVALID_LETTER, // [183] INVALID_LETTER, // [184] INVALID_LETTER, // [185] INVALID_LETTER, // [186] INVALID_LETTER, // [187] INVALID_LETTER, // [188] INVALID_LETTER, // [189] INVALID_LETTER, // [190] INVALID_LETTER, // [191] INVALID_LETTER, // [192] INVALID_LETTER, // [193] INVALID_LETTER, // [194] INVALID_LETTER, // [195] INVALID_LETTER, // [196] INVALID_LETTER, // [197] INVALID_LETTER, // [198] INVALID_LETTER, // [199] INVALID_LETTER, // [200] INVALID_LETTER, // [201] INVALID_LETTER, // [202] INVALID_LETTER, // [203] INVALID_LETTER, // [204] INVALID_LETTER, // [205] INVALID_LETTER, // [206] INVALID_LETTER, // [207] INVALID_LETTER, // [208] INVALID_LETTER, // [209] INVALID_LETTER, // [210] INVALID_LETTER, // [211] INVALID_LETTER, // [212] INVALID_LETTER, // [213] 
INVALID_LETTER, // [214] INVALID_LETTER, // [215] INVALID_LETTER, // [216] INVALID_LETTER, // [217] INVALID_LETTER, // [218] INVALID_LETTER, // [219] INVALID_LETTER, // [220] INVALID_LETTER, // [221] INVALID_LETTER, // [222] INVALID_LETTER, // [223] INVALID_LETTER, // [224] INVALID_LETTER, // [225] INVALID_LETTER, // [226] INVALID_LETTER, // [227] INVALID_LETTER, // [228] INVALID_LETTER, // [229] INVALID_LETTER, // [230] INVALID_LETTER, // [231] INVALID_LETTER, // [232] INVALID_LETTER, // [233] INVALID_LETTER, // [234] INVALID_LETTER, // [235] INVALID_LETTER, // [236] INVALID_LETTER, // [237] INVALID_LETTER, // [238] INVALID_LETTER, // [239] INVALID_LETTER, // [240] INVALID_LETTER, // [241] INVALID_LETTER, // [242] INVALID_LETTER, // [243] INVALID_LETTER, // [244] INVALID_LETTER, // [245] INVALID_LETTER, // [246] INVALID_LETTER, // [247] INVALID_LETTER, // [248] INVALID_LETTER, // [249] INVALID_LETTER, // [250] INVALID_LETTER, // [251] INVALID_LETTER, // [252] INVALID_LETTER, // [253] INVALID_LETTER, // [254] INVALID_LETTER, // [255] }; byte g_CharToCompLetter[256] = { INVALID_LETTER, // [ 0] INVALID_LETTER, // [ 1] INVALID_LETTER, // [ 2] INVALID_LETTER, // [ 3] INVALID_LETTER, // [ 4] INVALID_LETTER, // [ 5] INVALID_LETTER, // [ 6] INVALID_LETTER, // [ 7] INVALID_LETTER, // [ 8] INVALID_LETTER, // [ 9] INVALID_LETTER, // [ 10] INVALID_LETTER, // [ 11] INVALID_LETTER, // [ 12] INVALID_LETTER, // [ 13] INVALID_LETTER, // [ 14] INVALID_LETTER, // [ 15] INVALID_LETTER, // [ 16] INVALID_LETTER, // [ 17] INVALID_LETTER, // [ 18] INVALID_LETTER, // [ 19] INVALID_LETTER, // [ 20] INVALID_LETTER, // [ 21] INVALID_LETTER, // [ 22] INVALID_LETTER, // [ 23] INVALID_LETTER, // [ 24] INVALID_LETTER, // [ 25] INVALID_LETTER, // [ 26] INVALID_LETTER, // [ 27] INVALID_LETTER, // [ 28] INVALID_LETTER, // [ 29] INVALID_LETTER, // [ 30] INVALID_LETTER, // [ 31] INVALID_LETTER, // [ 32] INVALID_LETTER, // [ 33] INVALID_LETTER, // [ 34] INVALID_LETTER, // [ 35] INVALID_LETTER, // [ 
36] INVALID_LETTER, // [ 37] INVALID_LETTER, // [ 38] INVALID_LETTER, // [ 39] INVALID_LETTER, // [ 40] INVALID_LETTER, // [ 41] INVALID_LETTER, // [ 42] INVALID_LETTER, // [ 43] INVALID_LETTER, // [ 44] INVALID_LETTER, // [ 45] INVALID_LETTER, // [ 46] INVALID_LETTER, // [ 47] INVALID_LETTER, // [ 48] INVALID_LETTER, // [ 49] INVALID_LETTER, // [ 50] INVALID_LETTER, // [ 51] INVALID_LETTER, // [ 52] INVALID_LETTER, // [ 53] INVALID_LETTER, // [ 54] INVALID_LETTER, // [ 55] INVALID_LETTER, // [ 56] INVALID_LETTER, // [ 57] INVALID_LETTER, // [ 58] INVALID_LETTER, // [ 59] INVALID_LETTER, // [ 60] INVALID_LETTER, // [ 61] INVALID_LETTER, // [ 62] INVALID_LETTER, // [ 63] INVALID_LETTER, // [ 64] 3, // [ 65] A -> T INVALID_LETTER, // [ 66] B -> V 2, // [ 67] C -> G INVALID_LETTER, // [ 68] D -> H INVALID_LETTER, // [ 69] INVALID_LETTER, // [ 70] 1, // [ 71] G -> C INVALID_LETTER, // [ 72] H -> D INVALID_LETTER, // [ 73] INVALID_LETTER, // [ 74] INVALID_LETTER, // [ 75] K -> M INVALID_LETTER, // [ 76] INVALID_LETTER, // [ 77] M -> K INVALID_LETTER, // [ 78] N -> N INVALID_LETTER, // [ 79] INVALID_LETTER, // [ 80] INVALID_LETTER, // [ 81] INVALID_LETTER, // [ 82] R -> Y INVALID_LETTER, // [ 83] S -> S 0, // [ 84] T -> A 0, // [ 85] U -> A INVALID_LETTER, // [ 86] V -> B INVALID_LETTER, // [ 87] W -> W INVALID_LETTER, // [ 88] X -> X INVALID_LETTER, // [ 89] Y -> R INVALID_LETTER, // [ 90] INVALID_LETTER, // [ 91] INVALID_LETTER, // [ 92] INVALID_LETTER, // [ 93] INVALID_LETTER, // [ 94] INVALID_LETTER, // [ 95] INVALID_LETTER, // [ 96] 3, // [ 97] a -> t INVALID_LETTER, // [ 98] b -> v 2, // [ 99] c -> g INVALID_LETTER, // [100] d -> h INVALID_LETTER, // [101] INVALID_LETTER, // [102] 1, // [103] g -> c INVALID_LETTER, // [104] h -> d INVALID_LETTER, // [105] INVALID_LETTER, // [106] INVALID_LETTER, // [107] k -> m INVALID_LETTER, // [108] INVALID_LETTER, // [109] m -> k INVALID_LETTER, // [110] n -> n INVALID_LETTER, // [111] INVALID_LETTER, // [112] INVALID_LETTER, 
// [113] INVALID_LETTER, // [114] r -> y INVALID_LETTER, // [115] s -> s 0, // [116] t -> a INVALID_LETTER, // [117] INVALID_LETTER, // [118] v -> b INVALID_LETTER, // [119] w -> w INVALID_LETTER, // [120] x -> x INVALID_LETTER, // [121] y -> r INVALID_LETTER, // [122] INVALID_LETTER, // [123] INVALID_LETTER, // [124] INVALID_LETTER, // [125] INVALID_LETTER, // [126] INVALID_LETTER, // [127] INVALID_LETTER, // [128] INVALID_LETTER, // [129] INVALID_LETTER, // [130] INVALID_LETTER, // [131] INVALID_LETTER, // [132] INVALID_LETTER, // [133] INVALID_LETTER, // [134] INVALID_LETTER, // [135] INVALID_LETTER, // [136] INVALID_LETTER, // [137] INVALID_LETTER, // [138] INVALID_LETTER, // [139] INVALID_LETTER, // [140] INVALID_LETTER, // [141] INVALID_LETTER, // [142] INVALID_LETTER, // [143] INVALID_LETTER, // [144] INVALID_LETTER, // [145] INVALID_LETTER, // [146] INVALID_LETTER, // [147] INVALID_LETTER, // [148] INVALID_LETTER, // [149] INVALID_LETTER, // [150] INVALID_LETTER, // [151] INVALID_LETTER, // [152] INVALID_LETTER, // [153] INVALID_LETTER, // [154] INVALID_LETTER, // [155] INVALID_LETTER, // [156] INVALID_LETTER, // [157] INVALID_LETTER, // [158] INVALID_LETTER, // [159] INVALID_LETTER, // [160] INVALID_LETTER, // [161] INVALID_LETTER, // [162] INVALID_LETTER, // [163] INVALID_LETTER, // [164] INVALID_LETTER, // [165] INVALID_LETTER, // [166] INVALID_LETTER, // [167] INVALID_LETTER, // [168] INVALID_LETTER, // [169] INVALID_LETTER, // [170] INVALID_LETTER, // [171] INVALID_LETTER, // [172] INVALID_LETTER, // [173] INVALID_LETTER, // [174] INVALID_LETTER, // [175] INVALID_LETTER, // [176] INVALID_LETTER, // [177] INVALID_LETTER, // [178] INVALID_LETTER, // [179] INVALID_LETTER, // [180] INVALID_LETTER, // [181] INVALID_LETTER, // [182] INVALID_LETTER, // [183] INVALID_LETTER, // [184] INVALID_LETTER, // [185] INVALID_LETTER, // [186] INVALID_LETTER, // [187] INVALID_LETTER, // [188] INVALID_LETTER, // [189] INVALID_LETTER, // [190] INVALID_LETTER, // [191] 
INVALID_LETTER, // [192] INVALID_LETTER, // [193] INVALID_LETTER, // [194] INVALID_LETTER, // [195] INVALID_LETTER, // [196] INVALID_LETTER, // [197] INVALID_LETTER, // [198] INVALID_LETTER, // [199] INVALID_LETTER, // [200] INVALID_LETTER, // [201] INVALID_LETTER, // [202] INVALID_LETTER, // [203] INVALID_LETTER, // [204] INVALID_LETTER, // [205] INVALID_LETTER, // [206] INVALID_LETTER, // [207] INVALID_LETTER, // [208] INVALID_LETTER, // [209] INVALID_LETTER, // [210] INVALID_LETTER, // [211] INVALID_LETTER, // [212] INVALID_LETTER, // [213] INVALID_LETTER, // [214] INVALID_LETTER, // [215] INVALID_LETTER, // [216] INVALID_LETTER, // [217] INVALID_LETTER, // [218] INVALID_LETTER, // [219] INVALID_LETTER, // [220] INVALID_LETTER, // [221] INVALID_LETTER, // [222] INVALID_LETTER, // [223] INVALID_LETTER, // [224] INVALID_LETTER, // [225] INVALID_LETTER, // [226] INVALID_LETTER, // [227] INVALID_LETTER, // [228] INVALID_LETTER, // [229] INVALID_LETTER, // [230] INVALID_LETTER, // [231] INVALID_LETTER, // [232] INVALID_LETTER, // [233] INVALID_LETTER, // [234] INVALID_LETTER, // [235] INVALID_LETTER, // [236] INVALID_LETTER, // [237] INVALID_LETTER, // [238] INVALID_LETTER, // [239] INVALID_LETTER, // [240] INVALID_LETTER, // [241] INVALID_LETTER, // [242] INVALID_LETTER, // [243] INVALID_LETTER, // [244] INVALID_LETTER, // [245] INVALID_LETTER, // [246] INVALID_LETTER, // [247] INVALID_LETTER, // [248] INVALID_LETTER, // [249] INVALID_LETTER, // [250] INVALID_LETTER, // [251] INVALID_LETTER, // [252] INVALID_LETTER, // [253] INVALID_LETTER, // [254] INVALID_LETTER, // [255] }; bool g_IsSeqChar[256] = { false, // [ 0] 0x00 false, // [ 1] 0x01 false, // [ 2] 0x02 false, // [ 3] 0x03 false, // [ 4] 0x04 false, // [ 5] 0x05 false, // [ 6] 0x06 false, // [ 7] 0x07 false, // [ 8] 0x08 false, // [ 9] 0x09 false, // [ 10] 0x0a false, // [ 11] 0x0b false, // [ 12] 0x0c false, // [ 13] 0x0d false, // [ 14] 0x0e false, // [ 15] 0x0f false, // [ 16] 0x10 false, // [ 17] 0x11 
false, // [ 18] 0x12 false, // [ 19] 0x13 false, // [ 20] 0x14 false, // [ 21] 0x15 false, // [ 22] 0x16 false, // [ 23] 0x17 false, // [ 24] 0x18 false, // [ 25] 0x19 false, // [ 26] 0x1a false, // [ 27] 0x1b false, // [ 28] 0x1c false, // [ 29] 0x1d false, // [ 30] 0x1e false, // [ 31] 0x1f false, // [ 32] ' ' false, // [ 33] '!' false, // [ 34] '"' false, // [ 35] '#' false, // [ 36] '$' false, // [ 37] '%' false, // [ 38] '&' false, // [ 39] ''' false, // [ 40] '(' false, // [ 41] ')' true, // [ 42] '*' false, // [ 43] '+' false, // [ 44] ',' false, // [ 45] '-' false, // [ 46] '.' false, // [ 47] '/' false, // [ 48] '0' false, // [ 49] '1' false, // [ 50] '2' false, // [ 51] '3' false, // [ 52] '4' false, // [ 53] '5' false, // [ 54] '6' false, // [ 55] '7' false, // [ 56] '8' false, // [ 57] '9' false, // [ 58] ':' false, // [ 59] ';' false, // [ 60] '<' false, // [ 61] '=' false, // [ 62] '>' false, // [ 63] '?' false, // [ 64] '@' true, // [ 65] 'A' true, // [ 66] 'B' true, // [ 67] 'C' true, // [ 68] 'D' true, // [ 69] 'E' true, // [ 70] 'F' true, // [ 71] 'G' true, // [ 72] 'H' true, // [ 73] 'I' false, // [ 74] 'J' true, // [ 75] 'K' true, // [ 76] 'L' true, // [ 77] 'M' true, // [ 78] 'N' false, // [ 79] 'O' true, // [ 80] 'P' true, // [ 81] 'Q' true, // [ 82] 'R' true, // [ 83] 'S' true, // [ 84] 'T' true, // [ 85] 'U' true, // [ 86] 'V' true, // [ 87] 'W' true, // [ 88] 'X' true, // [ 89] 'Y' true, // [ 90] 'Z' false, // [ 91] '[' false, // [ 92] '\' false, // [ 93] ']' false, // [ 94] '^' false, // [ 95] '_' false, // [ 96] '`' true, // [ 97] 'a' true, // [ 98] 'b' true, // [ 99] 'c' true, // [100] 'd' true, // [101] 'e' true, // [102] 'f' true, // [103] 'g' true, // [104] 'h' true, // [105] 'i' false, // [106] 'j' true, // [107] 'k' true, // [108] 'l' true, // [109] 'm' true, // [110] 'n' false, // [111] 'o' true, // [112] 'p' true, // [113] 'q' true, // [114] 'r' true, // [115] 's' true, // [116] 't' true, // [117] 'u' true, // [118] 'v' true, // 
[119] 'w' true, // [120] 'x' true, // [121] 'y' true, // [122] 'z' false, // [123] '{' false, // [124] '|' false, // [125] '}' false, // [126] '~' false, // [127] 0x7f false, // [128] 0x80 false, // [129] 0x81 false, // [130] 0x82 false, // [131] 0x83 false, // [132] 0x84 false, // [133] 0x85 false, // [134] 0x86 false, // [135] 0x87 false, // [136] 0x88 false, // [137] 0x89 false, // [138] 0x8a false, // [139] 0x8b false, // [140] 0x8c false, // [141] 0x8d false, // [142] 0x8e false, // [143] 0x8f false, // [144] 0x90 false, // [145] 0x91 false, // [146] 0x92 false, // [147] 0x93 false, // [148] 0x94 false, // [149] 0x95 false, // [150] 0x96 false, // [151] 0x97 false, // [152] 0x98 false, // [153] 0x99 false, // [154] 0x9a false, // [155] 0x9b false, // [156] 0x9c false, // [157] 0x9d false, // [158] 0x9e false, // [159] 0x9f false, // [160] 0xa0 false, // [161] 0xa1 false, // [162] 0xa2 false, // [163] 0xa3 false, // [164] 0xa4 false, // [165] 0xa5 false, // [166] 0xa6 false, // [167] 0xa7 false, // [168] 0xa8 false, // [169] 0xa9 false, // [170] 0xaa false, // [171] 0xab false, // [172] 0xac false, // [173] 0xad false, // [174] 0xae false, // [175] 0xaf false, // [176] 0xb0 false, // [177] 0xb1 false, // [178] 0xb2 false, // [179] 0xb3 false, // [180] 0xb4 false, // [181] 0xb5 false, // [182] 0xb6 false, // [183] 0xb7 false, // [184] 0xb8 false, // [185] 0xb9 false, // [186] 0xba false, // [187] 0xbb false, // [188] 0xbc false, // [189] 0xbd false, // [190] 0xbe false, // [191] 0xbf false, // [192] 0xc0 false, // [193] 0xc1 false, // [194] 0xc2 false, // [195] 0xc3 false, // [196] 0xc4 false, // [197] 0xc5 false, // [198] 0xc6 false, // [199] 0xc7 false, // [200] 0xc8 false, // [201] 0xc9 false, // [202] 0xca false, // [203] 0xcb false, // [204] 0xcc false, // [205] 0xcd false, // [206] 0xce false, // [207] 0xcf false, // [208] 0xd0 false, // [209] 0xd1 false, // [210] 0xd2 false, // [211] 0xd3 false, // [212] 0xd4 false, // [213] 0xd5 false, // [214] 0xd6 
false, // [215] 0xd7 false, // [216] 0xd8 false, // [217] 0xd9 false, // [218] 0xda false, // [219] 0xdb false, // [220] 0xdc false, // [221] 0xdd false, // [222] 0xde false, // [223] 0xdf false, // [224] 0xe0 false, // [225] 0xe1 false, // [226] 0xe2 false, // [227] 0xe3 false, // [228] 0xe4 false, // [229] 0xe5 false, // [230] 0xe6 false, // [231] 0xe7 false, // [232] 0xe8 false, // [233] 0xe9 false, // [234] 0xea false, // [235] 0xeb false, // [236] 0xec false, // [237] 0xed false, // [238] 0xee false, // [239] 0xef false, // [240] 0xf0 false, // [241] 0xf1 false, // [242] 0xf2 false, // [243] 0xf3 false, // [244] 0xf4 false, // [245] 0xf5 false, // [246] 0xf6 false, // [247] 0xf7 false, // [248] 0xf8 false, // [249] 0xf9 false, // [250] 0xfa false, // [251] 0xfb false, // [252] 0xfc false, // [253] 0xfd false, // [254] 0xfe false, // [255] 0xff }; bool g_IsAminoChar[256] = { false, // [ 0] 0x00 false, // [ 1] 0x01 false, // [ 2] 0x02 false, // [ 3] 0x03 false, // [ 4] 0x04 false, // [ 5] 0x05 false, // [ 6] 0x06 false, // [ 7] 0x07 false, // [ 8] 0x08 false, // [ 9] 0x09 false, // [ 10] 0x0a false, // [ 11] 0x0b false, // [ 12] 0x0c false, // [ 13] 0x0d false, // [ 14] 0x0e false, // [ 15] 0x0f false, // [ 16] 0x10 false, // [ 17] 0x11 false, // [ 18] 0x12 false, // [ 19] 0x13 false, // [ 20] 0x14 false, // [ 21] 0x15 false, // [ 22] 0x16 false, // [ 23] 0x17 false, // [ 24] 0x18 false, // [ 25] 0x19 false, // [ 26] 0x1a false, // [ 27] 0x1b false, // [ 28] 0x1c false, // [ 29] 0x1d false, // [ 30] 0x1e false, // [ 31] 0x1f false, // [ 32] ' ' false, // [ 33] '!' false, // [ 34] '"' false, // [ 35] '#' false, // [ 36] '$' false, // [ 37] '%' false, // [ 38] '&' false, // [ 39] ''' false, // [ 40] '(' false, // [ 41] ')' true, // [ 42] '*' = STP false, // [ 43] '+' false, // [ 44] ',' false, // [ 45] '-' false, // [ 46] '.' 
false, // [ 47] '/' false, // [ 48] '0' false, // [ 49] '1' false, // [ 50] '2' false, // [ 51] '3' false, // [ 52] '4' false, // [ 53] '5' false, // [ 54] '6' false, // [ 55] '7' false, // [ 56] '8' false, // [ 57] '9' false, // [ 58] ':' false, // [ 59] ';' false, // [ 60] '<' false, // [ 61] '=' false, // [ 62] '>' false, // [ 63] '?' false, // [ 64] '@' true, // [ 65] 'A' = Ala false, // [ 66] 'B' true, // [ 67] 'C' = Cys true, // [ 68] 'D' = Asp true, // [ 69] 'E' = Glu true, // [ 70] 'F' = Phe true, // [ 71] 'G' = Gly true, // [ 72] 'H' = His true, // [ 73] 'I' = Ile false, // [ 74] 'J' true, // [ 75] 'K' = Lys true, // [ 76] 'L' = Leu true, // [ 77] 'M' = Met true, // [ 78] 'N' = Asn false, // [ 79] 'O' true, // [ 80] 'P' = Pro true, // [ 81] 'Q' = Gln true, // [ 82] 'R' = Arg true, // [ 83] 'S' = Ser true, // [ 84] 'T' = Thr false, // [ 85] 'U' true, // [ 86] 'V' = Val true, // [ 87] 'W' = Trp false, // [ 88] 'X' true, // [ 89] 'Y' = Tyr false, // [ 90] 'Z' false, // [ 91] '[' false, // [ 92] '\' false, // [ 93] ']' false, // [ 94] '^' false, // [ 95] '_' false, // [ 96] '`' true, // [ 97] 'A' = Ala false, // [ 98] 'B' true, // [ 99] 'C' = Cys true, // [100] 'D' = Asp true, // [101] 'E' = Glu true, // [102] 'F' = Phe true, // [103] 'G' = Gly true, // [104] 'H' = His true, // [105] 'I' = Ile false, // [106] 'J' true, // [107] 'K' = Lys true, // [108] 'L' = Leu true, // [109] 'M' = Met true, // [110] 'N' = Asn false, // [111] 'O' true, // [112] 'P' = Pro true, // [113] 'Q' = Gln true, // [114] 'R' = Arg true, // [115] 'S' = Ser true, // [116] 'T' = Thr false, // [117] 'U' true, // [118] 'V' = Val true, // [119] 'W' = Trp false, // [120] 'X' true, // [121] 'Y' = Tyr false, // [122] 'Z' false, // [123] '{' false, // [124] '|' false, // [125] '}' false, // [126] '~' false, // [127] 0x7f false, // [128] 0x80 false, // [129] 0x81 false, // [130] 0x82 false, // [131] 0x83 false, // [132] 0x84 false, // [133] 0x85 false, // [134] 0x86 false, // [135] 0x87 false, // 
[136] 0x88 false, // [137] 0x89 false, // [138] 0x8a false, // [139] 0x8b false, // [140] 0x8c false, // [141] 0x8d false, // [142] 0x8e false, // [143] 0x8f false, // [144] 0x90 false, // [145] 0x91 false, // [146] 0x92 false, // [147] 0x93 false, // [148] 0x94 false, // [149] 0x95 false, // [150] 0x96 false, // [151] 0x97 false, // [152] 0x98 false, // [153] 0x99 false, // [154] 0x9a false, // [155] 0x9b false, // [156] 0x9c false, // [157] 0x9d false, // [158] 0x9e false, // [159] 0x9f false, // [160] 0xa0 false, // [161] 0xa1 false, // [162] 0xa2 false, // [163] 0xa3 false, // [164] 0xa4 false, // [165] 0xa5 false, // [166] 0xa6 false, // [167] 0xa7 false, // [168] 0xa8 false, // [169] 0xa9 false, // [170] 0xaa false, // [171] 0xab false, // [172] 0xac false, // [173] 0xad false, // [174] 0xae false, // [175] 0xaf false, // [176] 0xb0 false, // [177] 0xb1 false, // [178] 0xb2 false, // [179] 0xb3 false, // [180] 0xb4 false, // [181] 0xb5 false, // [182] 0xb6 false, // [183] 0xb7 false, // [184] 0xb8 false, // [185] 0xb9 false, // [186] 0xba false, // [187] 0xbb false, // [188] 0xbc false, // [189] 0xbd false, // [190] 0xbe false, // [191] 0xbf false, // [192] 0xc0 false, // [193] 0xc1 false, // [194] 0xc2 false, // [195] 0xc3 false, // [196] 0xc4 false, // [197] 0xc5 false, // [198] 0xc6 false, // [199] 0xc7 false, // [200] 0xc8 false, // [201] 0xc9 false, // [202] 0xca false, // [203] 0xcb false, // [204] 0xcc false, // [205] 0xcd false, // [206] 0xce false, // [207] 0xcf false, // [208] 0xd0 false, // [209] 0xd1 false, // [210] 0xd2 false, // [211] 0xd3 false, // [212] 0xd4 false, // [213] 0xd5 false, // [214] 0xd6 false, // [215] 0xd7 false, // [216] 0xd8 false, // [217] 0xd9 false, // [218] 0xda false, // [219] 0xdb false, // [220] 0xdc false, // [221] 0xdd false, // [222] 0xde false, // [223] 0xdf false, // [224] 0xe0 false, // [225] 0xe1 false, // [226] 0xe2 false, // [227] 0xe3 false, // [228] 0xe4 false, // [229] 0xe5 false, // [230] 0xe6 false, // 
[231] 0xe7 false, // [232] 0xe8 false, // [233] 0xe9 false, // [234] 0xea false, // [235] 0xeb false, // [236] 0xec false, // [237] 0xed false, // [238] 0xee false, // [239] 0xef false, // [240] 0xf0 false, // [241] 0xf1 false, // [242] 0xf2 false, // [243] 0xf3 false, // [244] 0xf4 false, // [245] 0xf5 false, // [246] 0xf6 false, // [247] 0xf7 false, // [248] 0xf8 false, // [249] 0xf9 false, // [250] 0xfa false, // [251] 0xfb false, // [252] 0xfc false, // [253] 0xfd false, // [254] 0xfe false, // [255] 0xff }; bool g_IsNucleoChar[256] = { false, // [ 0] 0x00 false, // [ 1] 0x01 false, // [ 2] 0x02 false, // [ 3] 0x03 false, // [ 4] 0x04 false, // [ 5] 0x05 false, // [ 6] 0x06 false, // [ 7] 0x07 false, // [ 8] 0x08 false, // [ 9] 0x09 false, // [ 10] 0x0a false, // [ 11] 0x0b false, // [ 12] 0x0c false, // [ 13] 0x0d false, // [ 14] 0x0e false, // [ 15] 0x0f false, // [ 16] 0x10 false, // [ 17] 0x11 false, // [ 18] 0x12 false, // [ 19] 0x13 false, // [ 20] 0x14 false, // [ 21] 0x15 false, // [ 22] 0x16 false, // [ 23] 0x17 false, // [ 24] 0x18 false, // [ 25] 0x19 false, // [ 26] 0x1a false, // [ 27] 0x1b false, // [ 28] 0x1c false, // [ 29] 0x1d false, // [ 30] 0x1e false, // [ 31] 0x1f false, // [ 32] ' ' false, // [ 33] '!' false, // [ 34] '"' false, // [ 35] '#' false, // [ 36] '$' false, // [ 37] '%' false, // [ 38] '&' false, // [ 39] ''' false, // [ 40] '(' false, // [ 41] ')' false, // [ 42] '*' false, // [ 43] '+' false, // [ 44] ',' false, // [ 45] '-' false, // [ 46] '.' false, // [ 47] '/' false, // [ 48] '0' false, // [ 49] '1' false, // [ 50] '2' false, // [ 51] '3' false, // [ 52] '4' false, // [ 53] '5' false, // [ 54] '6' false, // [ 55] '7' false, // [ 56] '8' false, // [ 57] '9' false, // [ 58] ':' false, // [ 59] ';' false, // [ 60] '<' false, // [ 61] '=' false, // [ 62] '>' false, // [ 63] '?' 
false, // [ 64] '@' true, // [ 65] 'A' (Nucleotide) false, // [ 66] 'B' true, // [ 67] 'C' (Nucleotide) false, // [ 68] 'D' false, // [ 69] 'E' false, // [ 70] 'F' true, // [ 71] 'G' (Nucleotide) false, // [ 72] 'H' false, // [ 73] 'I' false, // [ 74] 'J' false, // [ 75] 'K' false, // [ 76] 'L' false, // [ 77] 'M' true, // [ 78] 'N' (Nucleotide) false, // [ 79] 'O' false, // [ 80] 'P' false, // [ 81] 'Q' false, // [ 82] 'R' false, // [ 83] 'S' true, // [ 84] 'T' (Nucleotide) true, // [ 85] 'U' (Nucleotide) false, // [ 86] 'V' false, // [ 87] 'W' false, // [ 88] 'X' false, // [ 89] 'Y' false, // [ 90] 'Z' false, // [ 91] '[' false, // [ 92] '\' false, // [ 93] ']' false, // [ 94] '^' false, // [ 95] '_' false, // [ 96] '`' true, // [ 97] 'A' (Nucleotide) false, // [ 98] 'B' true, // [ 99] 'C' (Nucleotide) false, // [100] 'D' false, // [101] 'E' false, // [102] 'F' true, // [103] 'G' (Nucleotide) false, // [104] 'H' false, // [105] 'I' false, // [106] 'J' false, // [107] 'K' false, // [108] 'L' false, // [109] 'M' true, // [110] 'N' (Nucleotide) false, // [111] 'O' false, // [112] 'P' false, // [113] 'Q' false, // [114] 'R' false, // [115] 'S' true, // [116] 'T' (Nucleotide) true, // [117] 'U' (Nucleotide) false, // [118] 'V' false, // [119] 'W' false, // [120] 'X' false, // [121] 'Y' false, // [122] 'Z' false, // [123] '{' false, // [124] '|' false, // [125] '}' false, // [126] '~' false, // [127] 0x7f false, // [128] 0x80 false, // [129] 0x81 false, // [130] 0x82 false, // [131] 0x83 false, // [132] 0x84 false, // [133] 0x85 false, // [134] 0x86 false, // [135] 0x87 false, // [136] 0x88 false, // [137] 0x89 false, // [138] 0x8a false, // [139] 0x8b false, // [140] 0x8c false, // [141] 0x8d false, // [142] 0x8e false, // [143] 0x8f false, // [144] 0x90 false, // [145] 0x91 false, // [146] 0x92 false, // [147] 0x93 false, // [148] 0x94 false, // [149] 0x95 false, // [150] 0x96 false, // [151] 0x97 false, // [152] 0x98 false, // [153] 0x99 false, // [154] 0x9a false, 
// [155] 0x9b false, // [156] 0x9c false, // [157] 0x9d false, // [158] 0x9e false, // [159] 0x9f false, // [160] 0xa0 false, // [161] 0xa1 false, // [162] 0xa2 false, // [163] 0xa3 false, // [164] 0xa4 false, // [165] 0xa5 false, // [166] 0xa6 false, // [167] 0xa7 false, // [168] 0xa8 false, // [169] 0xa9 false, // [170] 0xaa false, // [171] 0xab false, // [172] 0xac false, // [173] 0xad false, // [174] 0xae false, // [175] 0xaf false, // [176] 0xb0 false, // [177] 0xb1 false, // [178] 0xb2 false, // [179] 0xb3 false, // [180] 0xb4 false, // [181] 0xb5 false, // [182] 0xb6 false, // [183] 0xb7 false, // [184] 0xb8 false, // [185] 0xb9 false, // [186] 0xba false, // [187] 0xbb false, // [188] 0xbc false, // [189] 0xbd false, // [190] 0xbe false, // [191] 0xbf false, // [192] 0xc0 false, // [193] 0xc1 false, // [194] 0xc2 false, // [195] 0xc3 false, // [196] 0xc4 false, // [197] 0xc5 false, // [198] 0xc6 false, // [199] 0xc7 false, // [200] 0xc8 false, // [201] 0xc9 false, // [202] 0xca false, // [203] 0xcb false, // [204] 0xcc false, // [205] 0xcd false, // [206] 0xce false, // [207] 0xcf false, // [208] 0xd0 false, // [209] 0xd1 false, // [210] 0xd2 false, // [211] 0xd3 false, // [212] 0xd4 false, // [213] 0xd5 false, // [214] 0xd6 false, // [215] 0xd7 false, // [216] 0xd8 false, // [217] 0xd9 false, // [218] 0xda false, // [219] 0xdb false, // [220] 0xdc false, // [221] 0xdd false, // [222] 0xde false, // [223] 0xdf false, // [224] 0xe0 false, // [225] 0xe1 false, // [226] 0xe2 false, // [227] 0xe3 false, // [228] 0xe4 false, // [229] 0xe5 false, // [230] 0xe6 false, // [231] 0xe7 false, // [232] 0xe8 false, // [233] 0xe9 false, // [234] 0xea false, // [235] 0xeb false, // [236] 0xec false, // [237] 0xed false, // [238] 0xee false, // [239] 0xef false, // [240] 0xf0 false, // [241] 0xf1 false, // [242] 0xf2 false, // [243] 0xf3 false, // [244] 0xf4 false, // [245] 0xf5 false, // [246] 0xf6 false, // [247] 0xf7 false, // [248] 0xf8 false, // [249] 0xf9 false, // 
[250] 0xfa false, // [251] 0xfb false, // [252] 0xfc false, // [253] 0xfd false, // [254] 0xfe false, // [255] 0xff }; bool g_IsACGTU[256] = { false, // [ 0] 0x00 false, // [ 1] 0x01 false, // [ 2] 0x02 false, // [ 3] 0x03 false, // [ 4] 0x04 false, // [ 5] 0x05 false, // [ 6] 0x06 false, // [ 7] 0x07 false, // [ 8] 0x08 false, // [ 9] 0x09 false, // [ 10] 0x0a false, // [ 11] 0x0b false, // [ 12] 0x0c false, // [ 13] 0x0d false, // [ 14] 0x0e false, // [ 15] 0x0f false, // [ 16] 0x10 false, // [ 17] 0x11 false, // [ 18] 0x12 false, // [ 19] 0x13 false, // [ 20] 0x14 false, // [ 21] 0x15 false, // [ 22] 0x16 false, // [ 23] 0x17 false, // [ 24] 0x18 false, // [ 25] 0x19 false, // [ 26] 0x1a false, // [ 27] 0x1b false, // [ 28] 0x1c false, // [ 29] 0x1d false, // [ 30] 0x1e false, // [ 31] 0x1f false, // [ 32] ' ' false, // [ 33] '!' false, // [ 34] '"' false, // [ 35] '#' false, // [ 36] '$' false, // [ 37] '%' false, // [ 38] '&' false, // [ 39] ''' false, // [ 40] '(' false, // [ 41] ')' false, // [ 42] '*' false, // [ 43] '+' false, // [ 44] ',' false, // [ 45] '-' false, // [ 46] '.' false, // [ 47] '/' false, // [ 48] '0' false, // [ 49] '1' false, // [ 50] '2' false, // [ 51] '3' false, // [ 52] '4' false, // [ 53] '5' false, // [ 54] '6' false, // [ 55] '7' false, // [ 56] '8' false, // [ 57] '9' false, // [ 58] ':' false, // [ 59] ';' false, // [ 60] '<' false, // [ 61] '=' false, // [ 62] '>' false, // [ 63] '?' 
false, // [ 64] '@' true, // [ 65] 'A' (ACGT) false, // [ 66] 'B' true, // [ 67] 'C' (ACGT) false, // [ 68] 'D' false, // [ 69] 'E' false, // [ 70] 'F' true, // [ 71] 'G' (ACGT) false, // [ 72] 'H' false, // [ 73] 'I' false, // [ 74] 'J' false, // [ 75] 'K' false, // [ 76] 'L' false, // [ 77] 'M' false, // [ 78] 'N' false, // [ 79] 'O' false, // [ 80] 'P' false, // [ 81] 'Q' false, // [ 82] 'R' false, // [ 83] 'S' true, // [ 84] 'T' (ACGT) true, // [ 85] 'U' (ACGT) false, // [ 86] 'V' false, // [ 87] 'W' false, // [ 88] 'X' false, // [ 89] 'Y' false, // [ 90] 'Z' false, // [ 91] '[' false, // [ 92] '\' false, // [ 93] ']' false, // [ 94] '^' false, // [ 95] '_' false, // [ 96] '`' true, // [ 97] 'A' (ACGT) false, // [ 98] 'B' true, // [ 99] 'C' (ACGT) false, // [100] 'D' false, // [101] 'E' false, // [102] 'F' true, // [103] 'G' (ACGT) false, // [104] 'H' false, // [105] 'I' false, // [106] 'J' false, // [107] 'K' false, // [108] 'L' false, // [109] 'M' false, // [110] 'N' false, // [111] 'O' false, // [112] 'P' false, // [113] 'Q' false, // [114] 'R' false, // [115] 'S' true, // [116] 'T' (ACGT) true, // [117] 'U' (ACGT) false, // [118] 'V' false, // [119] 'W' false, // [120] 'X' false, // [121] 'Y' false, // [122] 'Z' false, // [123] '{' false, // [124] '|' false, // [125] '}' false, // [126] '~' false, // [127] 0x7f false, // [128] 0x80 false, // [129] 0x81 false, // [130] 0x82 false, // [131] 0x83 false, // [132] 0x84 false, // [133] 0x85 false, // [134] 0x86 false, // [135] 0x87 false, // [136] 0x88 false, // [137] 0x89 false, // [138] 0x8a false, // [139] 0x8b false, // [140] 0x8c false, // [141] 0x8d false, // [142] 0x8e false, // [143] 0x8f false, // [144] 0x90 false, // [145] 0x91 false, // [146] 0x92 false, // [147] 0x93 false, // [148] 0x94 false, // [149] 0x95 false, // [150] 0x96 false, // [151] 0x97 false, // [152] 0x98 false, // [153] 0x99 false, // [154] 0x9a false, // [155] 0x9b false, // [156] 0x9c false, // [157] 0x9d false, // [158] 0x9e false, 
// [159] 0x9f false, // [160] 0xa0 false, // [161] 0xa1 false, // [162] 0xa2 false, // [163] 0xa3 false, // [164] 0xa4 false, // [165] 0xa5 false, // [166] 0xa6 false, // [167] 0xa7 false, // [168] 0xa8 false, // [169] 0xa9 false, // [170] 0xaa false, // [171] 0xab false, // [172] 0xac false, // [173] 0xad false, // [174] 0xae false, // [175] 0xaf false, // [176] 0xb0 false, // [177] 0xb1 false, // [178] 0xb2 false, // [179] 0xb3 false, // [180] 0xb4 false, // [181] 0xb5 false, // [182] 0xb6 false, // [183] 0xb7 false, // [184] 0xb8 false, // [185] 0xb9 false, // [186] 0xba false, // [187] 0xbb false, // [188] 0xbc false, // [189] 0xbd false, // [190] 0xbe false, // [191] 0xbf false, // [192] 0xc0 false, // [193] 0xc1 false, // [194] 0xc2 false, // [195] 0xc3 false, // [196] 0xc4 false, // [197] 0xc5 false, // [198] 0xc6 false, // [199] 0xc7 false, // [200] 0xc8 false, // [201] 0xc9 false, // [202] 0xca false, // [203] 0xcb false, // [204] 0xcc false, // [205] 0xcd false, // [206] 0xce false, // [207] 0xcf false, // [208] 0xd0 false, // [209] 0xd1 false, // [210] 0xd2 false, // [211] 0xd3 false, // [212] 0xd4 false, // [213] 0xd5 false, // [214] 0xd6 false, // [215] 0xd7 false, // [216] 0xd8 false, // [217] 0xd9 false, // [218] 0xda false, // [219] 0xdb false, // [220] 0xdc false, // [221] 0xdd false, // [222] 0xde false, // [223] 0xdf false, // [224] 0xe0 false, // [225] 0xe1 false, // [226] 0xe2 false, // [227] 0xe3 false, // [228] 0xe4 false, // [229] 0xe5 false, // [230] 0xe6 false, // [231] 0xe7 false, // [232] 0xe8 false, // [233] 0xe9 false, // [234] 0xea false, // [235] 0xeb false, // [236] 0xec false, // [237] 0xed false, // [238] 0xee false, // [239] 0xef false, // [240] 0xf0 false, // [241] 0xf1 false, // [242] 0xf2 false, // [243] 0xf3 false, // [244] 0xf4 false, // [245] 0xf5 false, // [246] 0xf6 false, // [247] 0xf7 false, // [248] 0xf8 false, // [249] 0xf9 false, // [250] 0xfa false, // [251] 0xfb false, // [252] 0xfc false, // [253] 0xfd false, // 
[254] 0xfe false, // [255] 0xff }; float g_AminoFreqs[20] = { 0.0777f, // 'A' = Ala 0.0161f, // 'C' = Cys 0.0527f, // 'D' = Asp 0.0631f, // 'E' = Glu 0.0417f, // 'F' = Phe 0.0718f, // 'G' = Gly 0.0238f, // 'H' = His 0.0606f, // 'I' = Ile 0.0601f, // 'K' = Lys 0.0906f, // 'L' = Leu 0.0233f, // 'M' = Met 0.0439f, // 'N' = Asn 0.0456f, // 'P' = Pro 0.0368f, // 'Q' = Gln 0.0526f, // 'R' = Arg 0.0639f, // 'S' = Ser 0.0570f, // 'T' = Thr 0.0712f, // 'V' = Val 0.0134f, // 'W' = Trp 0.0339f, // 'Y' = Tyr }; muscle-5.1.0/src/alpha3.h000066400000000000000000000034171424453062600151300ustar00rootroot00000000000000#ifndef alpha3_h #define alpha3_h #include #include using namespace std; const byte INVALID_LETTER = 0xff; const byte INVALID_CHAR = '?'; const unsigned BAD_WORD = UINT_MAX; extern byte g_AminoAcidChars[]; extern byte g_CharToLetterAmino[]; extern byte g_CharToLetterAminoStop[]; extern byte g_CharToLetterAminoGap[]; extern byte g_LetterToCharAmino[]; extern byte g_LetterToCharAminoGap[]; extern byte g_CharToLetterNucleo[]; extern byte g_CharToLetterNucleoGap[]; extern byte g_CharToLetterNucleoMasked[]; extern byte g_LetterToCharNucleo[]; extern byte g_LetterToCharNucleoGap[]; extern byte g_CodonWordToAminoLetter[]; extern byte g_CodonWordToAminoChar[]; extern byte g_CharToCompChar[]; extern byte g_CharToCompLetter[]; extern byte g_IUPAC_PairCharToChar1[256]; extern byte g_IUPAC_PairCharToChar2[256]; extern byte g_IUPAC_PairCharToCharCase[256]; extern byte g_CharToLetterSEB8[256]; extern bool **g_MatchMxNucleo; extern bool **g_MatchMxAmino; extern bool g_IsAminoChar[]; extern bool g_IsNucleoChar[]; extern bool g_IsACGTU[]; extern bool g_IsSeqChar[]; extern float g_AminoFreqs[]; extern unsigned g_CharToLetterRed[]; extern byte g_LetterToCharRed[]; extern unsigned g_RedAlphaSize; void LogRedAlphaRed(); void ReadRedAlphaFromFile(const string &FileName); byte GetAminoCharFrom3NucChars(byte c1, byte c2, byte c3); const char *WordToStr(unsigned Word, unsigned WordLength, 
bool Nucleo); const char *WordToStrNucleo(unsigned Word, unsigned WordLength); const char *WordToStrAmino(unsigned Word, unsigned WordLength); const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str); static inline bool isgap(byte c) { return c == '-' || c == '.'; } void InitAlpha(); byte IUPAC_Pair(byte CharOrWildcard1, byte CharOrWildcard2); #endif // alpha3_h muscle-5.1.0/src/assertsameseqs.cpp000066400000000000000000000163041424453062600173550ustar00rootroot00000000000000#include "muscle.h" static uint g_AssertOkCount = 0; void _AssertSeqsEq(const char *FileName, uint LineNr, const MultiSequence &MSA1, const MultiSequence &MSA2) { const uint SeqCount1 = MSA1.GetSeqCount(); for (uint SeqIndex1 = 0; SeqIndex1 < SeqCount1; ++SeqIndex1) { const Sequence *Seq1 = MSA1.GetSequence((int) SeqIndex1); const string &Label = Seq1->m_Label; uint SeqIndex2 = MSA2.GetSeqIndex(Label); const Sequence *Seq2 = MSA2.GetSequence((int) SeqIndex2); uint GSI1 = Seq1->GetGSI(); uint GSI2 = Seq2->GetGSI(); Sequence *uSeq1 = Seq1->DeleteGaps(); Sequence *uSeq2 = Seq2->DeleteGaps(); int Length1 = uSeq1->GetLength(); int Length2 = uSeq2->GetLength(); const vector &v1 = uSeq1->m_CharVec; const vector &v2 = uSeq2->m_CharVec; if (v1 != v2 || GSI1 != GSI2) { Log("\n"); Log("AssertSeqsEq >%s\n", Label.c_str()); Log("GI1 %u, GI2 %u\n", GSI1, GSI2); Log("Seq1[%d] ", Length1); for (int i = 1; i < Length1; ++i) Log("%c", v1[i]); Log("\n"); Log("Seq2[%d] ", Length2); for (int i = 1; i < Length2; ++i) Log("%c", v2[i]); Log("\n"); Die("AssertSeqsEq %s:%u", FileName, LineNr); } DeleteSequence(uSeq1); DeleteSequence(uSeq2); } } void _AssertSeqsEqInput(const char *File, uint Line, const MultiSequence &MS) { const MultiSequence &GlobalMS = GetGlobalInputMS(); const uint GN = GetGlobalMSSeqCount(); const uint SeqCount = MS.GetSeqCount(); set GSIs; for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq = MS.GetSequence(i); uint GSI = Seq->GetGSI(); if (GSI >= GN) { MS.LogGSIs(); 
Die("%s:%u AssertSeqsEqInput GSI1=%u > GN=%u", File, Line, GSI, GN); } if (GSIs.find(GSI) != GSIs.end()) { MS.LogGSIs(); Die("%s:%u AssertSeqsEqInput dupe GSI=%u", File, Line, GSI); } const Sequence *InputSeq = GlobalMS.GetSequence(GSI); const string &Label = string(MS.GetLabel(i)); const string &GlobalLabel = InputSeq->m_Label; if (GlobalLabel != Label) { MS.LogGSIs(); Die("%s:%u AssertSeqsEqInput Seq(%u) GSI %u label '%s' != '%s'", File, Line, i, GSI, Label.c_str(), GlobalLabel.c_str()); } GSIs.insert(GSI); const Sequence *UngappedInputSeq = InputSeq->DeleteGaps(); const uint L = UngappedInputSeq->GetLength(); const Sequence *UngappedSeq = Seq->DeleteGaps(); const uint MSL = UngappedSeq->GetLength(); if (L != MSL) Die("%s:%u AssertSeqsEqInput Seq(%u) GSI=%u L=%u, MSL=%u, label=%s", File, Line, i, GSI, L, MSL, Label.c_str()); for (uint Pos = 0; Pos < L; ++Pos) { char InputChar = UngappedInputSeq->GetChar(Pos); char Char = UngappedSeq->GetChar(Pos); if (toupper(InputChar) != toupper(Char)) Die("%s:%u AssertSeqsEqInput Seq(%u) GSI=%u Pos[%u]=%c,%c label=%s", File, Line, i, GSI, Pos, Char, InputChar, Label.c_str()); } DeleteSequence(UngappedInputSeq); DeleteSequence(UngappedSeq); } } void _AssertSameSeqsVec(const char *File, uint Line, const MultiSequence &MS, vector &v) { MultiSequence *CombinedMS = new MultiSequence; const uint N = SIZE(v); for (uint i = 0; i < N; ++i) { const MultiSequence *MS = v[i]; const uint n = MS->GetSeqCount(); for (uint j = 0; j < n; ++j) { const Sequence *Seq = MS->GetSequence(j); CombinedMS->AddSequence(Seq, false); } } _AssertSameSeqs(File, Line, MS, *CombinedMS); ++g_AssertOkCount; delete CombinedMS; } void _AssertSameSeqsVec(const char *File, uint Line, const MultiSequence &MS, vector &v) { MultiSequence *CombinedMS = new MultiSequence; const uint N = SIZE(v); for (uint i = 0; i < N; ++i) { const MultiSequence *MS = v[i]; const uint n = MS->GetSeqCount(); for (uint j = 0; j < n; ++j) { const Sequence *Seq = MS->GetSequence(j); 
CombinedMS->AddSequence(Seq, false); } } _AssertSameSeqs(File, Line, MS, *CombinedMS); ++g_AssertOkCount; delete CombinedMS; } void _AssertSameSeqsJoin(const char *File, uint Line, const MultiSequence &MS1, const MultiSequence &MS2, const MultiSequence &MS12) { vector v; v.push_back(&MS1); v.push_back(&MS2); _AssertSameSeqsVec(File, Line, MS12, v); } uint GetAssertSameSeqsOkCount() { return g_AssertOkCount; } void _AssertSameLabels(const char *File, uint Line, const MultiSequence &MS) { const MultiSequence &GlobalMS = GetGlobalInputMS(); const uint GN = GetGlobalMSSeqCount(); const uint SeqCount = MS.GetSeqCount(); set GSIs; for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq = MS.GetSequence(i); uint GSI = Seq->GetGSI(); if (GSI >= GN) { MS.LogGSIs(); Die("%s:%u AssertSameLabels GSI1=%u > GN=%u", File, Line, GSI, GN); } if (GSIs.find(GSI) != GSIs.end()) { MS.LogGSIs(); Die("%s:%u AssertSameLabels dupe GSI=%u", File, Line, GSI); } const string &Label = string(MS.GetLabel(i)); const string &GlobalLabel = string(GlobalMS.GetLabel(GSI)); if (GlobalLabel != Label) { MS.LogGSIs(); Die("%s:%u AssertSameLabels Seq(%u) GSI %u label '%s' != '%s'", File, Line, i, GSI, Label.c_str(), GlobalLabel.c_str()); } GSIs.insert(GSI); } } void _AssertSameSeqs(const char *File, uint Line, const MultiSequence &MS1, const MultiSequence &MS2) { const MultiSequence &GlobalMS = GetGlobalInputMS(); const uint GN = GetGlobalMSSeqCount(); const uint SeqCount = MS1.GetSeqCount(); const uint SeqCount2 = MS2.GetSeqCount(); if (SeqCount2 != SeqCount) Die("%s:%u AssertSameSeqs N1=%u, N22=%u", File, Line, SeqCount, SeqCount2); set GSIs1; set GSIs2; for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq1 = MS1.GetSequence(i); const Sequence *Seq2 = MS2.GetSequence(i); uint GSI1 = Seq1->GetGSI(); uint GSI2 = Seq2->GetGSI(); if (GSI1 >= GN) Die("%s:%u AssertSameSeqs GSI1=%u > GN=%u", File, Line, GSI1, GN); if (GSI2 >= GN) Die("%s:%u AssertSameSeqs GSI2=%u > GN=%u", File, Line, GSI2, GN); if 
(GSIs1.find(GSI1) != GSIs1.end()) { MS1.LogGSIs(); Die("%s:%u AssertSameSeqs dupe GSI1=%u", File, Line, GSI1); } if (GSIs2.find(GSI2) != GSIs2.end()) { MS2.LogGSIs(); Die("%s:%u AssertSameSeqs dupe GSI2=%u", File, Line, GSI2, GN); } const string &Label1 = string(MS1.GetLabel(i)); const string &Label2 = string(MS2.GetLabel(i)); const string &GlobalLabel1 = string(GlobalMS.GetLabel(GSI1)); const string &GlobalLabel2 = string(GlobalMS.GetLabel(GSI2)); if (GlobalLabel1 != Label1) Die("%s:%u AssertSameSeqs Seq1(%u) GI %u label '%s' != '%s'", File, Line, i, GSI1, Label1.c_str(), GlobalLabel1.c_str()); if (GlobalLabel2 != Label2) Die("%s:%u AssertSameSeqs Seq2(%u) GI %u label '%s' != '%s'", File, Line, i, GSI2, Label2.c_str(), GlobalLabel2.c_str()); GSIs1.insert(GSI1); GSIs2.insert(GSI2); } for (set::const_iterator p = GSIs1.begin(); p != GSIs1.end(); ++p) { uint GSI1 = *p; if (GSIs2.find(GSI1) == GSIs2.end()) Die("%s:%u AssertSameSeqs GSI1=%u missing in MS2", File, Line, GSI1); } for (set::const_iterator p = GSIs2.begin(); p != GSIs2.end(); ++p) { uint GSI2 = *p; if (GSIs1.find(GSI2) == GSIs1.end()) Die("%s:%u AssertSameSeqs GSI2=%u missing in MS1", File, Line, GSI2); } ++g_AssertOkCount; } muscle-5.1.0/src/best3.h000066400000000000000000000013011424453062600147660ustar00rootroot00000000000000#pragma once // Store the largest of three values x1, x2, and x3 in *x. // If x_i is the largest value, then store b_i in *b. static inline void Best3(float x1, float x2, float x3, char b1, char b2, char b3, float *x, char *b) { if (x1 >= x2) { if (x1 >= x3) { *x = x1; *b = b1; return; } *x = x3; *b = b3; return; } if (x2 >= x3) { *x = x2; *b = b2; return; } *x = x3; *b = b3; } // Store the largest of three values x1, x2, and x3 in *x. 
// Writes the largest of x1, x2 and x3 to *x.
static inline void Best3(float x1, float x2, float x3, float *x)
{
	// Winner of the first pair, then compared against the third value.
	const float Top12 = (x1 >= x2) ? x1 : x2;
	*x = (Top12 >= x3) ? Top12 : x3;
}
void CalcPosteriorFlat3(const MultiSequence &MSA1, const MultiSequence &MSA2, const vector &SeqIndexes1, const vector &SeqIndexes2, const vector &SparseMxs, float *Flat) { const uint SeqCount1 = MSA1.GetSeqCount(); const uint SeqCount2 = MSA1.GetSeqCount(); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); const uint FlatSize = ColCount1*ColCount2; for (uint i = 0; i < FlatSize; ++i) Flat[i] = 0; vector PosToCol1; vector PosToCol2; // May be subset of all pairs due to sampling const uint PairCount = SIZE(SparseMxs); for (uint PairIndex = 0; PairIndex < PairCount; ++PairIndex) { uint SeqIndex1 = SeqIndexes1[PairIndex]; uint SeqIndex2 = SeqIndexes2[PairIndex]; const Sequence *Seq1 = MSA1.GetSequence(SeqIndex1); const Sequence *Seq2 = MSA2.GetSequence(SeqIndex2); const uint ColCountSeq1 = Seq1->GetLength(); const uint ColCountSeq2 = Seq2->GetLength(); asserta(ColCountSeq1 == ColCount1); asserta(ColCountSeq2 == ColCount2); const MySparseMx &PostMx12 = *SparseMxs[PairIndex]; const uint L1 = PostMx12.GetLX(); const uint L2 = PostMx12.GetLY(); Seq1->GetPosToCol(PosToCol1); Seq2->GetPosToCol(PosToCol2); asserta(SIZE(PosToCol1) == L1); asserta(SIZE(PosToCol2) == L2); for (uint Pos1 = 0; Pos1 < L1; ++Pos1) { uint Offset = PostMx12.GetOffset(Pos1); uint RowSize = PostMx12.GetSize(Pos1); assert(Pos1 < SIZE(PosToCol1)); uint Col1 = PosToCol1[Pos1]; uint FlatBase = Col1 * ColCount2; for (uint k = 0; k < RowSize; ++k) { float Prob = PostMx12.GetProb_Offset(Offset + k); uint Pos2 = PostMx12.GetCol_Offset(Offset + k); assert(Pos2 < SIZE(PosToCol2)); uint Col2 = PosToCol2[Pos2]; uint FlatOffset = FlatBase + Col2; assert(FlatOffset < FlatSize); Flat[FlatOffset] += Prob; } } } } muscle-5.1.0/src/buildpostflat.cpp000066400000000000000000000052441424453062600171670ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" #define TRACE 0 // Builds a posterior probability matrix needed to align a pair // of alignments. 
Mathematically, the returned matrix M is // defined as follows: // M[i,j] = sum sum f(s,t,i,j) // s in align1 t in align2 // where // [ P(s[i] <--> t[j]) // [ if s[i] is a letter in the ith column of align1 and // [ t[j] is a letter in the jth column of align2 // f(s,t,i,j) = [ // [ 0 otherwise // void MPCFlat::BuildPost(const MultiSequence &MSA1, const MultiSequence &MSA2, float *Post) { const uint SeqCount1 = MSA1.GetSeqCount(); const uint SeqCount2 = MSA2.GetSeqCount(); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); uint Ix = 0; for (uint i = 0; i < ColCount1; ++i) for (uint j = 0; j < ColCount2; ++j) Post[Ix++] = 0; // for each s in MSA1 vector PosToCol1; vector PosToCol2; for (uint SeqIndex1 = 0; SeqIndex1 < SeqCount1; ++SeqIndex1) { const Sequence *Seq1 = MSA1.GetSequence(SeqIndex1); uint SMI_1 = Seq1->GetSMI(); asserta(SMI_1 != UINT_MAX); Seq1->GetPosToCol(PosToCol1); // for each t in MSA2 for (uint SeqIndex2 = 0; SeqIndex2 < SeqCount2; SeqIndex2++) { const Sequence *Seq2 = MSA2.GetSequence(SeqIndex2); uint SMI_2 = Seq2->GetSMI(); asserta(SMI_2 != UINT_MAX); asserta(SMI_1 != SMI_2); Seq2->GetPosToCol(PosToCol2); if (SMI_1 < SMI_2) { uint PairIndex = GetPairIndex(SMI_1, SMI_2); const MySparseMx &Mx = GetSparsePost(PairIndex); const uint LX = Mx.GetLX(); const uint LY = Mx.GetLY(); for (uint i = 0; i < LX; ++i) { uint Col1 = PosToCol1[i]; uint Offset = Mx.GetOffset(i); uint Size = Mx.GetSize(i); for (uint k = 0; k < Size; ++k) { float P = Mx.GetProb_Offset(Offset); uint j = Mx.GetCol_Offset(Offset); ++Offset; uint Col2 = PosToCol2[j]; Post[Col1*ColCount2 + Col2] += P; } } } else { uint PairIndex = GetPairIndex(SMI_2, SMI_1); const MySparseMx &Mx = GetSparsePost(PairIndex); const uint LX = Mx.GetLX(); const uint LY = Mx.GetLY(); for (uint i = 0; i < LX; ++i) { uint Col2 = PosToCol2[i]; uint Offset = Mx.GetOffset(i); uint Size = Mx.GetSize(i); for (uint k = 0; k < Size; ++k) { float P = Mx.GetProb_Offset(Offset); uint j = 
Mx.GetCol_Offset(Offset); ++Offset; uint Col1 = PosToCol1[j]; Post[Col1*ColCount2 + Col2] += P; } } } } } #if 0//TRACE LogFlatMx("MSAPost", Post, ColCount1, ColCount2); #endif } muscle-5.1.0/src/bwdflat3.cpp000066400000000000000000000105601424453062600160160ustar00rootroot00000000000000#include "muscle.h" /*** Bwd[s][i][j] = probability of starting in state s and aligning last (LX-i) letters of X to last (LY-j) letters of Y. ***/ void CalcBwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat) { #include "hmmscores.h" const int iLX = int(LX); const int iLY = int(LY); const int LY1 = LY+1; const int BaseInc_i = HMMSTATE_COUNT*LY1; const int BaseInc_j = HMMSTATE_COUNT; uint Base = HMMSTATE_COUNT*(0*(LY1) + LY); for (int i = 0; i < iLX; ++i) { Flat[Base + HMMSTATE_IY] = LOG_ZERO; Flat[Base + HMMSTATE_JY] = LOG_ZERO; Base += BaseInc_i; } Base = HMMSTATE_COUNT*(LX*(LY1) + 0); for (int j = 0; j < iLY; ++j) { Flat[Base + HMMSTATE_IX] = LOG_ZERO; Flat[Base + HMMSTATE_JX] = LOG_ZERO; Base += BaseInc_j; } int Base_i_j = (int) HMMSTATE_COUNT*(LX*(LY1) + LY); int Base_i1_j = Base_i_j + BaseInc_i; int Base_i_j1 = Base_i_j + BaseInc_j; int Base_i1_j1 = Base_i_j + BaseInc_i + BaseInc_j; for (int i = iLX; i >= 0; --i) { char x = (i == iLX ? 0 : X[i]); float Emit_x = InsScore[x]; for (int j = iLY; j >= 0; --j) { if (i == LX && j == LY) { // Special case for end-of-alignment Flat[Base_i_j + HMMSTATE_M] = tSM; Flat[Base_i_j + HMMSTATE_IX] = tSI; Flat[Base_i_j + HMMSTATE_IY] = tSI; Flat[Base_i_j + HMMSTATE_JX] = tSJ; Flat[Base_i_j + HMMSTATE_JY] = tSJ; Base_i_j -= BaseInc_j; Base_i1_j -= BaseInc_j; Base_i_j1 -= BaseInc_j; Base_i1_j1 -= BaseInc_j; continue; } char y = (j == iLY ? 
0 : Y[j]); float Emit_y = InsScore[y]; float Emit_xy = MatchScore[x][y]; if (i < iLX && j < iLY) { float NextM = Flat[Base_i1_j1 + HMMSTATE_M] + Emit_xy; float NextIX = Flat[Base_i1_j + HMMSTATE_IX] + Emit_x; float NextJX = Flat[Base_i1_j + HMMSTATE_JX] + Emit_x; float NextIY = Flat[Base_i_j1 + HMMSTATE_IY] + Emit_y; float NextJY = Flat[Base_i_j1 + HMMSTATE_JY] + Emit_y; if (i > 0 && j > 0) { float M_M = tMM + NextM; float M_IX = tMI + NextIX; float M_JX = tMJ + NextJX; float M_IY = tMI + NextIY; float M_JY = tMJ + NextJY; Flat[Base_i_j + HMMSTATE_M] = LOG_ADD(M_M, M_IX, M_JX, M_IY, M_JY); } else Flat[Base_i_j + HMMSTATE_M] = LOG_ZERO; if (i > 0) { float IX_IX = tII + NextIX; float IX_M = tIM + NextM; Flat[Base_i_j + HMMSTATE_IX] = LOG_ADD(IX_IX, IX_M); float JX_JX = tJJ + NextJX; float JX_M = tJM + NextM; Flat[Base_i_j + HMMSTATE_JX] = LOG_ADD(JX_JX, JX_M); } else { Flat[Base_i_j + HMMSTATE_IX] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_JX] = LOG_ZERO; } if (j > 0) { float IY_IY = tII + NextIY; float IY_M = tIM + NextM; Flat[Base_i_j + HMMSTATE_IY] = LOG_ADD(IY_IY, IY_M); float JY_JY = tJJ + NextJY; float JY_M = tJM + NextM; Flat[Base_i_j + HMMSTATE_JY] = LOG_ADD(JY_JY, JY_M); } else { Flat[Base_i_j + HMMSTATE_IY] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_JY] = LOG_ZERO; } Base_i_j -= BaseInc_j; Base_i1_j -= BaseInc_j; Base_i_j1 -= BaseInc_j; Base_i1_j1 -= BaseInc_j; continue; } if (i < iLX) { assert(j == iLY); if (i > 0) { float NextIX = Flat[Base_i1_j + HMMSTATE_IX] + Emit_x; float NextJX = Flat[Base_i1_j + HMMSTATE_JX] + Emit_x; float M_IX = tMI + NextIX; float M_JX = tMJ + NextJX; Flat[Base_i_j + HMMSTATE_M] = LOG_ADD(M_IX, M_JX); Flat[Base_i_j + HMMSTATE_IX] = tII + NextIX; Flat[Base_i_j + HMMSTATE_JX] = tJJ + NextJX; } else { Flat[Base_i_j + HMMSTATE_M] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_IX] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_JX] = LOG_ZERO; } } if (j < iLY) { assert(i == iLX); float NextIY = Flat[Base_i_j1 + HMMSTATE_IY] + Emit_y; float NextJY = Flat[Base_i_j1 + 
HMMSTATE_JY] + Emit_y; float M_IY = tMI + NextIY; float M_JY = tMJ + NextJY; if (j > 0) { Flat[Base_i_j + HMMSTATE_M] = LOG_ADD(M_IY, M_JY); Flat[Base_i_j + HMMSTATE_IY] = tII + NextIY; Flat[Base_i_j + HMMSTATE_JY] = tJJ + NextJY; } else { Flat[Base_i_j + HMMSTATE_M] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_IY] = LOG_ZERO; Flat[Base_i_j + HMMSTATE_JY] = LOG_ZERO; } } Base_i_j -= BaseInc_j; Base_i1_j -= BaseInc_j; Base_i_j1 -= BaseInc_j; Base_i1_j1 -= BaseInc_j; } } } muscle-5.1.0/src/calcalnflat.cpp000066400000000000000000000015471424453062600165610ustar00rootroot00000000000000#include "muscle.h" #include "best3.h" void TraceBackFlat(const char *TB, uint LX, uint LY, string &Path); float CalcAlnFlat(const float *Post, uint LX, uint LY, float *DPRows, char *TB, string &Path) { Path.clear(); float *OldRow = DPRows; float *NewRow = DPRows + (LY+1); char *TBPtr = TB; for (uint j = 0; j <= LY; ++j) { OldRow[j] = 0; *TBPtr++ = 'Y'; } const float *PostPtr = Post; for (uint i = 1; i <= LX; ++i) { uint64 k = TBPtr - TB; *TBPtr++ = 'X'; NewRow[0] = 0; for (uint j = 1; j <= LY; ++j) { float B = OldRow[j-1] + *PostPtr++; float X = OldRow[j]; float Y = NewRow[j-1]; float Best; char TBChar; Best3(B, X, Y, 'B', 'X', 'Y', &Best, &TBChar); NewRow[j] = Best; *TBPtr++ = TBChar; } swap(OldRow, NewRow); } float Score = OldRow[LY]; TraceBackFlat(TB, LX, LY, Path); return Score; } muscle-5.1.0/src/calcalnscoreflat.cpp000066400000000000000000000011131424453062600176020ustar00rootroot00000000000000#include "muscle.h" #include "best3.h" float CalcAlnScoreFlat(const float *Post, uint LX, uint LY, float *DPRows) { float *Row = DPRows; for (uint j = 0; j <= LY; ++j) Row[j] = 0; const float *PostPtr = Post; for (uint i = 1; i <= LX; ++i) { float Currj1 = 0; float Prevj1 = Row[0]; Row[0] = 0; for (uint j = 1; j <= LY; ++j) { float Prevj = Row[j]; float P = *PostPtr++; float B = Prevj1 + P; float X = Prevj; float Y = Currj1; Prevj1 = Row[j]; Best3(B, X, Y, &Currj1); Row[j] = Currj1; } } float Score = 
Row[LY]; return Score; } muscle-5.1.0/src/calcalnscoresparse.cpp000066400000000000000000000006631424453062600201620ustar00rootroot00000000000000#include "muscle.h" #include "best3.h" float CalcAlnScoreSparse(const MySparseMx &Mx) { uint LX = Mx.m_LX; uint LY = Mx.m_LY; #if 0 float Sum = 0; for (uint i = 0; i < Mx.m_LX; ++i) Sum += Mx.GetMaxProbRow(i); return Sum; #endif float *Post = AllocPost(LX, LY); Mx.ToPost(Post); float *DPRows = AllocDPRows(LX, LY); float Score = CalcAlnScoreFlat(Post, LX, LY, DPRows); myfree(Post); myfree(DPRows); return Score; } muscle-5.1.0/src/calcposteriorflat.cpp000066400000000000000000000042141424453062600200270ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" void CalcPostFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY, float *Post) { float Total = CalcTotalProbFlat(FlatFwd, FlatBwd, LX, LY); uint IxFB = HMMSTATE_COUNT*((LY + 1) + 1); // M[1,1] uint IxPost = 0; for (uint i = 0; i < LX; ++i) { for (uint j = 0; j < LY; ++j) { float Score = FlatFwd[IxFB] + FlatBwd[IxFB] - Total; if (Score < MIN_SPARSE_SCORE) Post[IxPost++] = 0; else { float P = (Score >= LOG_ONE ? 
1.0f : expf(Score)); Post[IxPost++] = P; } IxFB += HMMSTATE_COUNT; } IxFB += HMMSTATE_COUNT; } } void MPCFlat::CalcPosterior(uint PairIndex) { const pair &Pair = GetPair(PairIndex); const uint SeqIndexX = Pair.first; const uint SeqIndexY = Pair.second; uint LX = GetL(SeqIndexX); uint LY = GetL(SeqIndexY); float *Fwd = AllocFB(LX, LY); float *Bwd = AllocFB(LX, LY); const byte *X = GetBytePtr(SeqIndexX); const byte *Y = GetBytePtr(SeqIndexY); CalcFwdFlat(X, LX, Y, LY, Fwd); CalcBwdFlat(X, LX, Y, LY, Bwd); float *Post = AllocPost(LX, LY); CalcPostFlat(Fwd, Bwd, LX, LY, Post); #if 0//TRACE LogFlatMxs("FwdFlat", Fwd, LX, LY); LogFlatMxs("BwdFlat", Bwd, LX, LY); LogFlatMx("PostFlat", Post, LX, LY); #endif myfree(Fwd); myfree(Bwd); #if 0//TRACE LogFlatMx1("Fwd", Fwd, LX, LY); LogFlatMx1("Bwd", Bwd, LX, LY); LogFlatMx("Post", Post, LX, LY); #endif MySparseMx &SparsePost = GetSparsePost(PairIndex); SparsePost.FromPost(Post, LX, LY); SparsePost.m_X = X; SparsePost.m_Y = Y; #if 0//TRACE SparsePost.LogMe(); #endif float *DPRows = AllocDPRows(LX, LY); float Score = CalcAlnScoreFlat(Post, LX, LY, DPRows); myfree(Post); myfree(DPRows); #if 0//TRACE string Path; char *TB = myalloc(char, (LX+1)*(LY+1)); float Score2 = CalcAlnFlat(Post, LX, LY, DPRows, TB, Path); Log("Score=%.3g Score2=%.3g\n", Score, Score2); myfree(TB); #endif float EA = Score/min(LX, LY); #if 0//TRACE const char *LabelX = GetLabel(SeqIndexX); const char *LabelY = GetLabel(SeqIndexY); Log("Flat EA(%s, %s) = %.3g\n", LabelX, LabelY, EA); #endif m_DistMx[SeqIndexX][SeqIndexY] = EA; m_DistMx[SeqIndexY][SeqIndexX] = EA; } muscle-5.1.0/src/chainer.h000066400000000000000000000026531424453062600153720ustar00rootroot00000000000000#pragma once #include "hspfinder.h" #include const float BAD_SCORE = -9e9f; struct HSPData; // Bendpoint struct BPData { uint Pos; bool IsLo; uint Index; void LogMe() const { Log("BP%s Pos %u Ix %u", (IsLo ? 
// Working state for chaining HSPs. The entry point is Run(); the static
// members are logging and validation helpers over a vector of HSPs.
// NOTE(review): the element types of the vector/list members and
// parameters are not visible in this copy of the header -- confirm them
// against the original before editing signatures.
class Chainer
{
public:
	// Input HSPs; reset to null by Clear().
	const vector *m_HSPs;

	// Break-point array; released with myfree() in Clear().
	BPData *m_BPs;

	// Released with myfree() in Clear().
	uint *m_PrevHSPIndexes;		// Predecessor in chain

	// Released with myfree() in Clear().
	float *m_HSPIndexToChainScore;

	list m_Chains;	// Live HSP indexes

public:
	// Starts with all pointers null so that Clear() is safe at any time.
	Chainer()
	{
		m_HSPs = 0;
		m_BPs = 0;
		m_PrevHSPIndexes = 0;
		m_HSPIndexToChainScore = 0;
	}

	// Releases the working arrays through Clear().
	~Chainer()
	{
		Clear();
	}

	// Empties the chain list, frees the three working arrays and resets
	// every pointer to null.
	void Clear()
	{
		m_HSPs = 0;
		m_Chains.clear();
		myfree(m_BPs);
		myfree(m_PrevHSPIndexes);
		myfree(m_HSPIndexToChainScore);
		m_BPs = 0;
		m_PrevHSPIndexes = 0;
		m_HSPIndexToChainScore = 0;
	}

	// Takes the HSPs as input and writes the result to Chain.
	void Run(const vector &HSPs, vector &Chain);
	void LogMe() const;
	void LogBPs() const;

public:
	static void LogHSPs(const vector &HSPs);
	static bool IsValidChain(const vector &HSPs);
	static void AssertValidChain(const vector &HSPs);
	static void LogChain(const vector &HSPs, bool TestValid);
	static float GetChainScore(HSPData **HSPs, uint HSPCount);

private:
	void SetBPs();
	void SortBPs();
	uint FindBestChainLT(uint Ahi, uint Bhi);
};
// Maps a confidence in (0, 1] to a histogram bin in 0..MAXBIN.
// Conf == 1 gets the dedicated top bin MAXBIN; every other value falls
// into one of MAXBIN equal-width bins, 0..MAXBIN-1.
static uint GetBin(double Conf)
{
	asserta(Conf > 0 && Conf <= 1);
	if (Conf == 1)
		return MAXBIN;

	// Scale by the bin count instead of a literal 10 so that the bin width
	// stays in step with MAXBIN.
	const uint Bin = uint(Conf*MAXBIN);
	asserta(Bin < MAXBIN);
	return Bin;
}
asserta(CorrectCount <= Count); double P = 0; if (Count > 0) P = double(CorrectCount)/Count; Pf(fOut, "bin %u %u %u %.4f\n", Bin, Count, CorrectCount, P); ProgressLog(" %.2f", P); } ProgressLog("\n"); CloseStdioFile(fOut); } muscle-5.1.0/src/consflat.cpp000066400000000000000000000010601424453062600161140ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" #include "locallock.h" void MPCFlat::ConsIter(uint Iter) { uint PairCount = SIZE(m_Pairs); asserta(PairCount > 0); unsigned ThreadCount = GetRequestedThreadCount(); uint PairCounter = 0; #pragma omp parallel for num_threads(ThreadCount) for (int PairIndex = 0; PairIndex < (int) PairCount; ++PairIndex) { Lock(); ProgressStep(PairCounter++, PairCount, "Consistency (%u/%u)", Iter+1, m_ConsistencyIterCount); Unlock(); ConsPair(PairIndex); } swap(m_ptrSparsePosts, m_ptrUpdatedSparsePosts); } muscle-5.1.0/src/conspairflat.cpp000066400000000000000000000054271424453062600170030ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" #if 0//TRACE const byte *g_X; const byte *g_Y; const byte *g_Z; #endif void MPCFlat::ConsPair(uint PairIndex) { const pair &Pair = GetPair(PairIndex); uint SeqIndexX = Pair.first; uint SeqIndexY = Pair.second; const MySparseMx &SparsePostXY = GetSparsePost(PairIndex); uint LX = GetSeqLength(SeqIndexX); uint LY = GetSeqLength(SeqIndexY); asserta(SparsePostXY.GetLX() == LX); asserta(SparsePostXY.GetLY() == LY); float *Post = AllocPost(LX, LY); SparsePostXY.ToPost(Post); // Account for Z=X and Z=Y (hence the factor 2) for (uint k = 0; k < LX*LY; ++k) Post[k] *= 2; #if 0//TRACE LogFlatMx("ConsPair Z=X Z=Y", Post, LX, LY); #endif const uint SeqCount = GetSeqCount(); asserta(SeqIndexX < SeqIndexY); // because convention for pairs for (uint SeqIndexZ = 0; SeqIndexZ < SeqCount; ++SeqIndexZ) { if (SeqIndexZ == SeqIndexX || SeqIndexZ == SeqIndexY) continue; #if 0//TRACE g_X = GetSequence(SeqIndexX)->GetBytePtr(); g_Y = GetSequence(SeqIndexY)->GetBytePtr(); g_Z = 
// Reusable scratch memory for the counting-sort routines declared after
// this class. Holds NVEC equally sized unsigned arrays that grow on demand
// through Alloc().
// NOTE(review): no destructor is declared here, so the arrays are released
// only when Free() is called explicitly -- confirm that owners do so.
// NOTE(review): the template arguments of the GoBuff members are not
// visible in this copy of the header.
class CountSortMem
{
public:
	// Number of scratch arrays.
	static const unsigned NVEC = 8;

public:
	// Scratch arrays; each has m_MaxValueCount elements, or is null.
	unsigned *m_Vecs[NVEC];

	// Not initialised by the constructor.
	unsigned m_VecPos[NVEC];

	// Current capacity, in elements, of every array in m_Vecs.
	unsigned m_MaxValueCount;
	GoBuff m_Sizes;
	GoBuff m_Offsets;

public:
	// Starts empty: zero capacity, m_Vecs cleared with memset_zero().
	CountSortMem()
	{
		m_MaxValueCount = 0;
		memset_zero(m_Vecs, NVEC);
	}

	// Releases all scratch arrays and resets the capacity to zero.
	void Free()
	{
		for (unsigned i = 0; i < NVEC; ++i)
		{
			myfree(m_Vecs[i]);
			m_Vecs[i] = 0;
		}
		m_MaxValueCount = 0;
	}

	// Ensures every array can hold ValueCount elements. Does nothing when
	// the current capacity suffices; otherwise the old arrays are freed
	// first, so their contents are not preserved.
	void Alloc(unsigned ValueCount)
	{
		if (ValueCount <= m_MaxValueCount)
			return;

		Free();

		m_MaxValueCount = ValueCount;
		for (unsigned i = 0; i < NVEC; ++i)
			m_Vecs[i] = myalloc(unsigned, m_MaxValueCount);
	}
};
0.0013177") ADD_STR("E.HF 0.00072545") ADD_STR("E.HG 0.001037") ADD_STR("E.HH 0.00868") ADD_STR("E.IA 0.0031885") ADD_STR("E.IC 0.0009404") ADD_STR("E.ID 0.0010536") ADD_STR("E.IE 0.0012421") ADD_STR("E.IF 0.0027995") ADD_STR("E.IG 0.0014252") ADD_STR("E.IH 0.00059716") ADD_STR("E.II 0.017783") ADD_STR("E.KA 0.0033169") ADD_STR("E.KC 0.00046951") ADD_STR("E.KD 0.0025252") ADD_STR("E.KE 0.0042842") ADD_STR("E.KF 0.00087222") ADD_STR("E.KG 0.0025931") ADD_STR("E.KH 0.0012138") ADD_STR("E.KI 0.0015785") ADD_STR("E.KK 0.016122") ADD_STR("E.LA 0.0044958") ADD_STR("E.LC 0.0013849") ADD_STR("E.LD 0.0016197") ADD_STR("E.LE 0.0022206") ADD_STR("E.LF 0.0053337") ADD_STR("E.LG 0.0021285") ADD_STR("E.LH 0.0011175") ADD_STR("E.LI 0.010718") ADD_STR("E.LK 0.0025963") ADD_STR("E.LL 0.035839") ADD_STR("E.MA 0.0014888") ADD_STR("E.MC 0.00037421") ADD_STR("E.MD 0.00047808") ADD_STR("E.ME 0.00076105") ADD_STR("E.MF 0.0011611") ADD_STR("E.MG 0.00066504") ADD_STR("E.MH 0.00042237") ADD_STR("E.MI 0.002241") ADD_STR("E.MK 0.0009612") ADD_STR("E.ML 0.0046194") ADD_STR("E.MM 0.0040952") ADD_STR("E.NA 0.0021023") ADD_STR("E.NC 0.00042479") ADD_STR("E.ND 0.0035354") ADD_STR("E.NE 0.0022474") ADD_STR("E.NF 0.00084658") ADD_STR("E.NG 0.0028888") ADD_STR("E.NH 0.001412") ADD_STR("E.NI 0.0010427") ADD_STR("E.NK 0.0025731") ADD_STR("E.NL 0.0016028") ADD_STR("E.NM 0.00063401") ADD_STR("E.NN 0.012819") ADD_STR("E.PA 0.0023062") ADD_STR("E.PC 0.00034766") ADD_STR("E.PD 0.0012538") ADD_STR("E.PE 0.0015155") ADD_STR("E.PF 0.00060701") ADD_STR("E.PG 0.001556") ADD_STR("E.PH 0.00049078") ADD_STR("E.PI 0.0010377") ADD_STR("E.PK 0.0015484") ADD_STR("E.PL 0.0015731") ADD_STR("E.PM 0.00046718") ADD_STR("E.PN 0.0010028") ADD_STR("E.PP 0.018461") ADD_STR("E.QA 0.002191") ADD_STR("E.QC 0.00032102") ADD_STR("E.QD 0.0017678") ADD_STR("E.QE 0.0034513") ADD_STR("E.QF 0.00059248") ADD_STR("E.QG 0.0014243") ADD_STR("E.QH 0.001139") ADD_STR("E.QI 0.0010088") ADD_STR("E.QK 0.0031231") ADD_STR("E.QL 0.0018055") 
ADD_STR("E.QM 0.00075546") ADD_STR("E.QN 0.0015822") ADD_STR("E.QP 0.00090111") ADD_STR("E.QQ 0.007566") ADD_STR("E.RA 0.002445") ADD_STR("E.RC 0.00044701") ADD_STR("E.RD 0.0016166") ADD_STR("E.RE 0.0026887") ADD_STR("E.RF 0.00090768") ADD_STR("E.RG 0.0019486") ADD_STR("E.RH 0.001321") ADD_STR("E.RI 0.0013814") ADD_STR("E.RK 0.0059565") ADD_STR("E.RL 0.0024681") ADD_STR("E.RM 0.00076734") ADD_STR("E.RN 0.0020778") ADD_STR("E.RP 0.0010627") ADD_STR("E.RQ 0.0025353") ADD_STR("E.RR 0.017751") ADD_STR("E.SA 0.0063175") ADD_STR("E.SC 0.00094867") ADD_STR("E.SD 0.0028523") ADD_STR("E.SE 0.002939") ADD_STR("E.SF 0.0011904") ADD_STR("E.SG 0.0038196") ADD_STR("E.SH 0.0011642") ADD_STR("E.SI 0.0017357") ADD_STR("E.SK 0.0031263") ADD_STR("E.SL 0.0025096") ADD_STR("E.SM 0.00087787") ADD_STR("E.SN 0.003014") ADD_STR("E.SP 0.0018004") ADD_STR("E.SQ 0.0019115") ADD_STR("E.SR 0.0022454") ADD_STR("E.SS 0.013466") ADD_STR("E.TA 0.0039") ADD_STR("E.TC 0.00073798") ADD_STR("E.TD 0.0018049") ADD_STR("E.TE 0.0021676") ADD_STR("E.TF 0.0010759") ADD_STR("E.TG 0.0021484") ADD_STR("E.TH 0.00077747") ADD_STR("E.TI 0.0024897") ADD_STR("E.TK 0.0025086") ADD_STR("E.TL 0.0030227") ADD_STR("E.TM 0.00093371") ADD_STR("E.TN 0.0022014") ADD_STR("E.TP 0.0014798") ADD_STR("E.TQ 0.0015453") ADD_STR("E.TR 0.0018605") ADD_STR("E.TS 0.0048729") ADD_STR("E.TT 0.012994") ADD_STR("E.VA 0.0053324") ADD_STR("E.VC 0.0011915") ADD_STR("E.VD 0.0012792") ADD_STR("E.VE 0.001787") ADD_STR("E.VF 0.0025616") ADD_STR("E.VG 0.0019458") ADD_STR("E.VH 0.00071553") ADD_STR("E.VI 0.01118") ADD_STR("E.VK 0.002109") ADD_STR("E.VL 0.0091446") ADD_STR("E.VM 0.0019746") ADD_STR("E.VN 0.0013661") ADD_STR("E.VP 0.0013578") ADD_STR("E.VQ 0.0013284") ADD_STR("E.VR 0.0016936") ADD_STR("E.VS 0.002416") ADD_STR("E.VT 0.0034345") ADD_STR("E.VV 0.020752") ADD_STR("E.WA 0.00039119") ADD_STR("E.WC 0.00010666") ADD_STR("E.WD 0.00016015") ADD_STR("E.WE 0.00023815") ADD_STR("E.WF 0.00085751") ADD_STR("E.WG 0.00038786") ADD_STR("E.WH 
0.00019097") ADD_STR("E.WI 0.00039549") ADD_STR("E.WK 0.00028448") ADD_STR("E.WL 0.00076736") ADD_STR("E.WM 0.00016253") ADD_STR("E.WN 0.00021006") ADD_STR("E.WP 0.00015674") ADD_STR("E.WQ 0.00020592") ADD_STR("E.WR 0.00029139") ADD_STR("E.WS 0.00026525") ADD_STR("E.WT 0.00024961") ADD_STR("E.WV 0.00038538") ADD_STR("E.WW 0.0056363") ADD_STR("E.YA 0.0013184") ADD_STR("E.YC 0.00036626") ADD_STR("E.YD 0.00066005") ADD_STR("E.YE 0.00092548") ADD_STR("E.YF 0.0036874") ADD_STR("E.YG 0.00089301") ADD_STR("E.YH 0.0013104") ADD_STR("E.YI 0.0012786") ADD_STR("E.YK 0.0010082") ADD_STR("E.YL 0.0021971") ADD_STR("E.YM 0.00054105") ADD_STR("E.YN 0.0007496") ADD_STR("E.YP 0.00047608") ADD_STR("E.YQ 0.00070192") ADD_STR("E.YR 0.0009943") ADD_STR("E.YS 0.0010265") ADD_STR("E.YT 0.00094759") ADD_STR("E.YV 0.00148") ADD_STR("E.YW 0.00069226") ADD_STR("E.YY 0.0099931") #undef ADD_STR } void HMMParams::GetDefaultHMMParams_Nucleo(vector &Lines) { Lines.clear(); #define ADD_STR(s) Lines.push_back(s); ADD_STR("HMM nt") ADD_STR("T.START_M 0.6") ADD_STR("T.START_IS 0.02") ADD_STR("T.START_IL 0.18") ADD_STR("T.M_M 0.96") ADD_STR("T.M_IS 0.012") ADD_STR("T.M_IL 0.008") ADD_STR("T.IS_IS 0.35") ADD_STR("T.IS_M 0.65") ADD_STR("T.IL_IL 0.90") ADD_STR("T.IL_M 0.10") #define Diag "0.12" #define Other "0.044" ADD_STR("E.AA " Diag) ADD_STR("E.CA " Other) ADD_STR("E.CC " Diag) ADD_STR("E.GA " Other) ADD_STR("E.GC " Other) ADD_STR("E.GG " Diag) ADD_STR("E.TA " Other) ADD_STR("E.TC " Other) ADD_STR("E.TG " Other) ADD_STR("E.TT " Diag) #undef ADD_STR } muscle-5.1.0/src/derep.cpp000066400000000000000000000130541424453062600154100ustar00rootroot00000000000000#include "muscle.h" #include "derep.h" void Derep::Clear() { m_SeqIndexToRepSeqIndex.clear(); m_RepSeqIndexes.clear(); m_RepSeqIndexToSeqIndexes.clear(); m_HashToSeqIndexes.clear(); } // FNV64 hash uint Derep::CalcHash(const Sequence *Seq) const { uint64 hash = 0xcbf29ce484222325uL; const uint L = Seq->GetLength(); for (uint i = 0; i < L; ++i) { char 
c = Seq->GetChar(i); byte b = (byte) tolower(c); hash *= 1099511628211uL; hash ^= (uint64) b; } uint h = uint(hash%m_SlotCount); return h; } void Derep::Run(MultiSequence &InputSeqs) { Clear(); m_InputSeqs = &InputSeqs; const uint InputSeqCount = InputSeqs.GetSeqCount(); m_SlotCount = 3*InputSeqCount + 7; m_HashToSeqIndexes.resize(m_SlotCount); m_SeqIndexToRepSeqIndex.resize(InputSeqCount, UINT_MAX); m_RepSeqIndexToSeqIndexes.resize(InputSeqCount); uint UniqueCount = 0; uint DupeCount = 0; for (uint SeqIndex = 0; SeqIndex < InputSeqCount; ++SeqIndex) { ProgressStep(SeqIndex, InputSeqCount, "Derep %u uniques, %u dupes", UniqueCount, DupeCount); uint RepSeqIndex = Search(SeqIndex); if (RepSeqIndex == UINT_MAX) { AddToHash(SeqIndex); asserta(SIZE(m_RepSeqIndexes) == UniqueCount); m_RepSeqIndexes.push_back(SeqIndex); m_RepSeqIndexToSeqIndexes[SeqIndex].push_back(SeqIndex); m_SeqIndexToRepSeqIndex[SeqIndex] = SeqIndex; ++UniqueCount; } else { m_RepSeqIndexToSeqIndexes[RepSeqIndex].push_back(SeqIndex); m_SeqIndexToRepSeqIndex[SeqIndex] = RepSeqIndex; ++DupeCount; } } } bool Derep::SeqsEq(uint SeqIndex1, uint SeqIndex2) const { const Sequence *Seq1 = m_InputSeqs->GetSequence(SeqIndex1); const Sequence *Seq2 = m_InputSeqs->GetSequence(SeqIndex2); const uint L = Seq1->GetLength(); const uint L2 = Seq2->GetLength(); if (L2 != L) return false; for (uint i = 0; i < L; ++i) { char c1 = Seq1->GetChar(i); char c2 = Seq2->GetChar(i); if (toupper(c1) != toupper(c2)) return false; } return true; } uint Derep::Search(uint SeqIndex) const { const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); asserta(Seq != 0); uint h = CalcHash(Seq); asserta(h < SIZE(m_HashToSeqIndexes)); const vector &Row = m_HashToSeqIndexes[h]; const uint n = SIZE(Row); for (uint i = 0; i < n; ++i) { uint SeqIndex2 = Row[i]; if (SeqsEq(SeqIndex, SeqIndex2)) return SeqIndex2; } return UINT_MAX; } void Derep::AddToHash(uint SeqIndex) { const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); asserta(Seq != 0); 
uint h = CalcHash(Seq); asserta(h < SIZE(m_HashToSeqIndexes)); vector &Row = m_HashToSeqIndexes[h]; Row.push_back(SeqIndex); } void Derep::GetUniqueSeqs(MultiSequence &UniqueSeqs) { asserta(UniqueSeqs.GetSeqCount() == 0); const uint UniqueCount = SIZE(m_RepSeqIndexes); for (uint i = 0; i < UniqueCount; ++i) { uint SeqIndex = m_RepSeqIndexes[i]; const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); UniqueSeqs.AddSequence(Seq, false); } AssertSameLabels(UniqueSeqs); } void Derep::Validate() const { asserta(m_InputSeqs != 0); const uint InputSeqCount = m_InputSeqs->GetSeqCount(); asserta(SIZE(m_SeqIndexToRepSeqIndex) == InputSeqCount); asserta(SIZE(m_RepSeqIndexToSeqIndexes) == InputSeqCount); const uint ClusterCount = SIZE(m_RepSeqIndexes); set RepSeqIndexSet; for (uint SeqIndex = 0; SeqIndex < InputSeqCount; ++SeqIndex) { uint RepSeqIndex = m_SeqIndexToRepSeqIndex[SeqIndex]; RepSeqIndexSet.insert(RepSeqIndex); } const uint RepSeqIndexCount = SIZE(m_RepSeqIndexes); asserta(SIZE(RepSeqIndexSet) == RepSeqIndexCount); for (uint i = 0; i < RepSeqIndexCount; ++i) { uint RepSeqIndex = m_RepSeqIndexes[i]; asserta(RepSeqIndexSet.find(RepSeqIndex) != RepSeqIndexSet.end()); const vector &MemberSeqIndexes = m_RepSeqIndexToSeqIndexes[RepSeqIndex]; const uint MemberCount = SIZE(MemberSeqIndexes); asserta(MemberCount > 0); for (uint j = 0; j < MemberCount; ++j) { uint MemberSeqIndex = MemberSeqIndexes[j]; uint MemberRepSeqIndex = m_SeqIndexToRepSeqIndex[MemberSeqIndex]; asserta(MemberRepSeqIndex == RepSeqIndex); } } } void Derep::GetDupeGSIs(vector &GSIs, vector &GlobalRepSeqIndexes) const { GSIs.clear(); GlobalRepSeqIndexes.clear(); const uint InputSeqCount = m_InputSeqs->GetSeqCount(); const uint GlobalMSSeqCount = GetGlobalMSSeqCount(); const uint ClusterCount = SIZE(m_RepSeqIndexes); for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex) { uint RepSeqIndex = m_RepSeqIndexes[ClusterIndex]; asserta(RepSeqIndex < InputSeqCount); const vector 
&MemberSeqIndexes = m_RepSeqIndexToSeqIndexes[RepSeqIndex]; const uint MemberCount = SIZE(MemberSeqIndexes); const Sequence *Seq = m_InputSeqs->GetSequence(RepSeqIndex); uint GlobalRepSeqIndex = Seq->GetGSI(); asserta(GlobalRepSeqIndex < GlobalMSSeqCount); asserta(MemberSeqIndexes[0] == RepSeqIndex); for (uint i = 1; i < MemberCount; ++i) { uint MemberSeqIndex = MemberSeqIndexes[i]; const Sequence *Seq = m_InputSeqs->GetSequence(MemberSeqIndex); uint GlobalMemberSeqIndex = Seq->GetGSI(); asserta(GlobalMemberSeqIndex < GlobalMSSeqCount); GSIs.push_back(GlobalMemberSeqIndex); GlobalRepSeqIndexes.push_back(GlobalRepSeqIndex); } } } void cmd_derep() { const string &InputFileName = opt(derep); const string &OutputFileName = opt(output); MultiSequence InputSeqs; InputSeqs.FromFASTA(InputFileName); Derep D; D.Run(InputSeqs); D.Validate(); MultiSequence *UniqueSeqs = new MultiSequence; D.GetUniqueSeqs(*UniqueSeqs); UniqueSeqs->WriteMFA(OutputFileName); } muscle-5.1.0/src/derep.h000066400000000000000000000012001424453062600150430ustar00rootroot00000000000000#pragma once class Derep { public: MultiSequence *m_InputSeqs = 0; vector m_SeqIndexToRepSeqIndex; vector m_RepSeqIndexes; vector > m_RepSeqIndexToSeqIndexes; uint m_SlotCount = 0; vector > m_HashToSeqIndexes; public: uint CalcHash(const Sequence *Seq) const; void Clear(); void Run(MultiSequence &InputSeqs); uint Search(uint SeqIndex) const; void GetUniqueSeqs(MultiSequence &UniqueSeqs); void AddToHash(uint SeqIndex); bool SeqsEq(uint SeqIndex1, uint SeqIndex2) const; void Validate() const; void GetDupeGSIs(vector &GSIs, vector &GlobalRepSeqIndexes) const; }; muscle-5.1.0/src/diagbox.cpp000066400000000000000000000055431424453062600157320ustar00rootroot00000000000000#include "myutils.h" #include "diagbox.h" #define TEST 0 /*** DiagBox represents a diagonal "rectangle" in the D.P. matrix. i = 0..LA-1 j = 0..LB-1 d = LA - i + j = 1 .. 
LA+LB-1 j = d - LA + i i = LA - d + j ***/ void GetDiagRange(uint LA, uint LB, uint d, uint &mini, uint &minj, uint &maxi, uint &maxj) { if (d >= LA) { mini = 0; maxi = min(LA+LB-1-d, LA-1); minj = d - LA; maxj = min(LB-1, d-1); } else { mini = LA-d; maxi = min(LA+LB-1-d, LA-1); minj = 0; maxj = min(LB-1, d-1); } } void GetDiagBox(uint LA, uint LB, uint DiagLo, uint DiagHi, DiagBox &Box) { asserta(DiagLo <= DiagHi); asserta(DiagLo >= 1); asserta(DiagHi <= LA + LB - 1); Box.LA = LA; Box.LB = LB; Box.dlo = DiagLo; Box.dhi = DiagHi; GetDiagRange(LA, LB, DiagLo, Box.dlo_mini, Box.dlo_minj, Box.dlo_maxi, Box.dlo_maxj); GetDiagRange(LA, LB, DiagHi, Box.dhi_mini, Box.dhi_minj, Box.dhi_maxi, Box.dhi_maxj); } void GetDiagLoHi(uint LA, uint LB, const char *Path, uint &dlo, uint &dhi) { dlo = UINT_MAX; dhi = UINT_MAX; uint i = 0; uint j = 0; for (uint k = 0; ; ++k) { char c = Path[k]; if (c == 0) break; if (c == 'M') { uint d = LA - i + j; if (dlo == UINT_MAX) { dlo = d; dhi = d; } else { if (d < dlo) dlo = d; if (d > dhi) dhi = d; } } if (c == 'M' || c == 'D') ++i; if (c == 'M' || c == 'I') ++j; } } #if TEST static void Test2(uint LA, uint LB, uint DiagLo, uint DiagHi) { DiagBox Box; GetDiagBox(LA, LB, DiagLo, DiagHi, Box); Box.LogMe(); Box.Validate(); } static void Test1(uint LA, uint LB, uint d, uint i, uint j, uint I, uint J) { uint mini, maxi, minj, maxj; GetDiagRange(LA, LB, d, mini, minj, maxi, maxj); Log("LA=%u LB=%u d=%u (%u,%u) (%u,%u) expected (%u,%u) (%u,%u)\n", LA, LB, d, mini, minj, maxi, maxj, i, j, I, J); asserta(mini == i); asserta(maxi == I); asserta(minj == j); asserta(maxj == J); } void TestDiagBox() { Test2(16, 19, 17, 37); Test1(5, 3, 1, 4, 0, 4, 0); Test1(5, 3, 2, 3, 0, 4, 1); Test1(5, 3, 3, 2, 0, 4, 2); Test1(5, 3, 4, 1, 0, 3, 2); Test1(5, 3, 5, 0, 0, 2, 2); Test1(5, 3, 6, 0, 1, 1, 2); Test1(5, 3, 7, 0, 2, 0, 2); Test1(3, 5, 1, 2, 0, 2, 0); Test1(3, 5, 2, 1, 0, 2, 1); Test1(3, 5, 3, 0, 0, 2, 2); Test1(3, 5, 4, 0, 1, 2, 3); Test1(3, 5, 5, 0, 2, 2, 4); 
Test1(3, 5, 6, 0, 3, 1, 4); Test1(3, 5, 7, 0, 4, 0, 4); Test1(5, 5, 1, 4, 0, 4, 0); Test1(5, 5, 2, 3, 0, 4, 1); Test1(5, 5, 3, 2, 0, 4, 2); Test1(5, 5, 4, 1, 0, 4, 3); Test1(5, 5, 5, 0, 0, 4, 4); Test1(5, 5, 6, 0, 1, 3, 4); Test1(5, 5, 7, 0, 2, 2, 4); Test1(5, 5, 8, 0, 3, 1, 4); Test1(5, 5, 9, 0, 4, 0, 4); for (uint LA = 2; LA <= 5; ++LA) for (uint LB = 2; LB <= 5; ++LB) for (uint dlo = 1; dlo <= LA+LB-1; ++dlo) for (uint dhi = dlo; dhi <= LA+LB-1; ++dhi) Test2(LA, LB, dlo, dhi); Log("\n"); Log("ALL OK\n"); } #endif // TEST muscle-5.1.0/src/diagbox.h000066400000000000000000000071671424453062600154030ustar00rootroot00000000000000#pragma once struct DiagBox; void GetDiagBox(uint LA, uint LB, uint DiagLo, uint DiagHi, DiagBox &Box); void GetDiagRange(uint LA, uint LB, uint d, uint &mini, uint &minj, uint &maxi, uint &maxj); void GetDiagLoHi(uint LA, uint LB, const char *Path, uint &dlo, uint &dhi); struct DiagBox { DiagBox() { } DiagBox(uint LA_, uint LB_, uint DiagLo, uint DiagHi) { //GetDiagBox(LA, LB, DiagLo, DiagHi, *this); //Validate(); Init(LA_, LB_, DiagLo, DiagHi); } void Init(uint LA_, uint LB_, uint DiagLo, uint DiagHi) { GetDiagBox(LA_, LB_, DiagLo, DiagHi, *this); Validate(); } uint LA; uint LB; uint dlo; uint dhi; uint dlo_mini; uint dlo_minj; uint dlo_maxi; uint dlo_maxj; uint dhi_mini; uint dhi_minj; uint dhi_maxi; uint dhi_maxj; uint GetDiag(uint i, uint j) const { return LA - i + j; } // i, j are positions 0..LA-1, 0..LB-1. bool InBox(uint i, uint j) const { uint d = GetDiag(i, j); return d >= dlo && d <= dhi; } /*** i, j are 0-based prefix lengths 0..LA, 0..LB. A full path is in the box iff all match pairs are in the box. A partial path that aligns a prefix of A to a prefix of B as in D.P.) is in the box iff it is is the prefix of at least one full path that is in the box. A D.P. matrix entry X[i][j] is in the box iff there is at least one full path aligning the first i letters of A and the first j letters of B ending in a column of type X, i.e. 
if there exists a partial path in the box that ends in X. Assume terminals appear in all paths, and DI/ID forbidden. Intuitively seems that by these definitions D is in box iff DM or MD is in box, I is in box iff IM or MI is in box. Don't have proof.. ***/ bool InBoxDPM(uint i, uint j) const { // Special case for M[0][0] if (i == 0 && j == 0) return true; if (i == 0 || j == 0) return false; uint d = GetDiag(i-1, j-1); return d >= dlo && d <= dhi; } bool InBoxDPD(uint i, uint j) const { bool MD = i == 0 ? false : InBoxDPM(i-1, j); bool DM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1); return MD || DM; } bool InBoxDPI(uint i, uint j) const { bool MI = j == 0 ? false : InBoxDPM(i, j-1); bool IM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1); return MI || IM; } // d = LA - i + j = 1 .. LA+LB-1 void Validate() const { asserta(dlo <= dhi); asserta(dlo >= GetDiag(LA-1, 0)); asserta(dhi <= GetDiag(0, LB-1)); asserta(GetDiag(dlo_mini, dlo_minj) == dlo); asserta(GetDiag(dlo_maxi, dlo_maxj) == dlo); asserta(GetDiag(dhi_mini, dhi_minj) == dhi); asserta(GetDiag(dhi_maxi, dhi_maxj) == dhi); asserta(dlo_mini >= dhi_mini); asserta(dlo_minj <= dhi_minj); asserta(dlo_maxi >= dhi_maxi); asserta(dlo_maxj <= dhi_maxj); } uint GetMini() const { return dhi_mini; } uint GetMaxi() const { return dlo_maxi; } uint GetMinj() const { return dlo_minj; } uint GetMaxj() const { return dhi_maxj; } /*** i = 0..LA-1 j = 0..LB-1 d = LA - i + j = 1 .. 
LA+LB-1 j = d - LA + i i = LA - d + j ***/ void GetRange_j(uint i, uint &Startj, uint &Endj) const { // j = d - LA + i if (dlo + i >= LA) Startj = dlo + i - LA; else Startj = 0; if (Startj >= LB) Startj = LB - 1; if (dhi + i + 1 >= LA) Endj = dhi + i + 1 - LA; else Endj = 0; if (Endj >= LB) Endj = LB - 1; asserta(Endj >= Startj); asserta(Startj < LB); } void LogMe() const { Log("LA=%u LB=%d dlo(%u): (%u,%u)-(%u,%u) dhi(%u): (%u,%u)-(%u,%u) i=[%u-%u] j=[%u-%u]\n", LA, LB, dlo, dlo_mini, dlo_minj, dlo_maxi, dlo_maxj, dhi, dhi_mini, dhi_minj, dhi_maxi, dhi_maxj, GetMini(), GetMaxi(), GetMinj(), GetMaxj()); } }; muscle-5.1.0/src/disperse.cpp000066400000000000000000000006301424453062600161230ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_disperse() { const string FileName = opt(disperse); Ensemble E; E.FromFile(FileName); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); double D_LP; double D_Cols; E.GetDispersion(MaxGapFract, D_LP, D_Cols); ProgressLog("@disperse file=%s D_LP=%.4g D_Cols=%.4g\n", FileName.c_str(), D_LP, D_Cols); } muscle-5.1.0/src/dividetree.cpp000066400000000000000000000041461424453062600164370ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" void MakeSubsetNodes(const Tree &InputTree, const vector &SubsetNodes, const vector &SubsetLabels, Tree &SubsetTree); void DivideTree(const Tree &InputTree, uint Node, Tree &Subtree, Tree &Supertree) { asserta(InputTree.IsRooted()); const uint InputNodeCount = InputTree.GetNodeCount(); const uint InputLeafCount = InputTree.GetLeafCount(); asserta(Node < InputNodeCount); asserta(!InputTree.IsRoot(Node)); vector SubtreeLeafNodes; InputTree.GetSubtreeLeafNodes(Node, SubtreeLeafNodes); uint N = SIZE(SubtreeLeafNodes); asserta(N > 0); set SubtreeSet; vector SubtreeLabels; for (uint i = 0; i < N; ++i) { uint Node2 = SubtreeLeafNodes[i]; string Label; InputTree.GetLabel(Node2, Label); SubtreeSet.insert(Node2); 
SubtreeLabels.push_back(Label); } vector SupertreeLeafNodes; vector SupertreeLabels; for (uint Node2 = 0; Node2 < InputNodeCount; ++Node2) { if (!InputTree.IsLeaf(Node2)) continue; if (SubtreeSet.find(Node2) == SubtreeSet.end()) { string Label; InputTree.GetLabel(Node2, Label); SupertreeLeafNodes.push_back(Node2); SupertreeLabels.push_back(Label); } } const uint SubtreeLeafCount = SIZE(SubtreeLeafNodes); const uint SupertreeLeafCount = SIZE(SupertreeLeafNodes); asserta(SubtreeLeafCount > 0); asserta(SupertreeLeafCount > 0); asserta(SubtreeLeafCount + SupertreeLeafCount == InputLeafCount); MakeSubsetNodes(InputTree, SubtreeLeafNodes, SubtreeLabels, Subtree); MakeSubsetNodes(InputTree, SupertreeLeafNodes, SupertreeLabels, Supertree); } void cmd_divide_tree() { const string &InputFileName = opt(divide_tree); Tree InputTree; InputTree.FromFile(InputFileName); const string &Label1 = opt(label1); const string &Label2 = opt(label2); uint Node1 = InputTree.GetNodeIndex(Label1); uint Node2 = InputTree.GetNodeIndex(Label2); uint DivideNode = InputTree.GetLCA(Node1, Node2); Tree Subtree; Tree Supertree; DivideTree(InputTree, DivideNode, Subtree, Supertree); Subtree.ToFile(opt(subtreeout)); Supertree.ToFile(opt(supertreeout)); } muscle-5.1.0/src/eacluster.cpp000066400000000000000000000146561424453062600163110ustar00rootroot00000000000000#include "muscle.h" #include "eacluster.h" #include "locallock.h" void MakeReplicateFileName_N(const string &Pattern, uint N, string &FileName) { FileName.clear(); bool Found = false; for (uint i = 0; i < SIZE(Pattern); ++i) { char c = Pattern[i]; if (c == '@') { string s; Ps(s, "%u", N); FileName += s; Found = true; } else FileName += c; } if (!Found) { string s; Ps(s, "%u", N); FileName += s; } } void EACluster::Clear() { m_CentroidSeqIndexes.clear(); m_CentroidIndexToSeqIndexes.clear(); m_SeqIndexToCentroidIndex.clear(); m_ClusterMFAs.clear(); } void EACluster::Run(MultiSequence &InputSeqs, float MinEA) { AssertSameLabels(InputSeqs); 
Clear(); m_US.Init(); m_InputSeqs = &InputSeqs; const uint InputSeqCount = InputSeqs.GetSeqCount(); asserta(InputSeqCount > 0); m_SeqIndexToCentroidIndex.clear(); m_SeqIndexToCentroidIndex.resize(InputSeqCount, UINT_MAX); const float MinEE = (1 - MinEA); uint ClusterCount = 0; uint MemberCount = 0; for (uint SeqIndex = 0; SeqIndex < InputSeqCount; ++SeqIndex) { ProgressStep(SeqIndex, InputSeqCount, "UCLUST %u seqs EE<%.2f, %u centroids, %u members", InputSeqCount, MinEE, ClusterCount, MemberCount); const char *Label = m_InputSeqs->GetSequence(SeqIndex)->m_Label.c_str(); float BestEA; uint CentroidIndex = GetBestCentroid(SeqIndex, MinEA, BestEA); m_SeqIndexToCentroidIndex[SeqIndex] = CentroidIndex; if (CentroidIndex == UINT_MAX) { uint ClusterIndex = ClusterCount; ++ClusterCount; m_SeqIndexToCentroidIndex[SeqIndex] = ClusterIndex; m_CentroidSeqIndexes.push_back(SeqIndex); vector v; v.push_back(SeqIndex); m_CentroidIndexToSeqIndexes.push_back(v); uint L; const byte *ByteSeq = m_InputSeqs->GetByteSeq(SeqIndex, L); m_US.AddSeq(ByteSeq, L, SeqIndex); } else { ++MemberCount; asserta(CentroidIndex < SIZE(m_CentroidIndexToSeqIndexes)); asserta(CentroidIndex < SIZE(m_CentroidSeqIndexes)); uint CentroidSeqIndex = m_CentroidSeqIndexes[CentroidIndex]; const char *CentroidLabel = m_InputSeqs->GetSequence(CentroidSeqIndex)->m_Label.c_str(); m_CentroidIndexToSeqIndexes[CentroidIndex].push_back(SeqIndex); } Validate(); } MakeClusterMFAs(); } uint EACluster::GetBestCentroid(uint SeqIndex, float MinEA, float &BestEA) { uint CentroidCount = SIZE(m_CentroidSeqIndexes); if (CentroidCount == 0) return UINT_MAX; uint L; const byte *ByteSeq = m_InputSeqs->GetByteSeq(SeqIndex, L); vector TopSeqIndexes; vector TopWordCounts; m_US.SearchSeq(ByteSeq, L, TopSeqIndexes, TopWordCounts); const uint TopCount = SIZE(TopSeqIndexes); asserta(SIZE(TopWordCounts) == TopCount); if (TopCount == 0) return UINT_MAX; uint ThreadCount = GetRequestedThreadCount(); BestEA = 0; uint BestCentroidIndex = 
UINT_MAX; bool Done = false; #pragma omp parallel for num_threads(ThreadCount) for (int TopIndex = 0; TopIndex < (int) TopCount; ++TopIndex) { if (Done) continue; uint TopSeqIndex = TopSeqIndexes[TopIndex]; float EA = AlignSeqPair(SeqIndex, TopSeqIndex); Lock(); if (EA > MinEA && EA > BestEA) { BestEA = EA; asserta(TopSeqIndex < SIZE(m_SeqIndexToCentroidIndex)); uint CentroidIndex = m_SeqIndexToCentroidIndex[TopSeqIndex]; asserta(CentroidIndex < CentroidCount); BestCentroidIndex = CentroidIndex; } if (BestEA >= MinEA) { if (BestEA > 0.9) Done = true; if (BestEA - EA > 0.3) Done = true; } if (BestEA < MinEA - 0.3 && TopIndex > 20) Done = true; Unlock(); } return BestCentroidIndex; } void EACluster::GetClusterMFAs(vector &MFAs) const { const uint N = SIZE(m_ClusterMFAs); MFAs.clear(); for (uint i = 0; i < N; ++i) { MultiSequence *ClusterMFA = m_ClusterMFAs[i]; AssertSameLabels(*ClusterMFA); MFAs.push_back(ClusterMFA); } } void EACluster::WriteMFAs(const string &FileNamePattern) const { const uint CentroidCount = SIZE(m_ClusterMFAs); for (uint CentroidIndex = 0; CentroidIndex < CentroidCount; ++CentroidIndex) { ProgressStep(CentroidIndex, CentroidCount, "Write cluster MFAs"); const MultiSequence *MFA = m_ClusterMFAs[CentroidIndex]; asserta(MFA != 0); string FileName; MakeReplicateFileName_N(FileNamePattern, CentroidIndex+1, FileName); MFA->WriteMFA(FileName); } } void EACluster::MakeClusterMFAs() { const uint CentroidCount = SIZE(m_CentroidSeqIndexes); m_ClusterMFAs.clear(); for (uint CentroidIndex = 0; CentroidIndex < CentroidCount; ++CentroidIndex) { ProgressStep(CentroidIndex, CentroidCount, "Make cluster MFAs"); MultiSequence *ClusterMFA = new MultiSequence; asserta(ClusterMFA != 0); const vector &SeqIndexes = m_CentroidIndexToSeqIndexes[CentroidIndex]; const uint MemberCount = SIZE(SeqIndexes); for (uint i = 0; i < MemberCount; ++i) { uint SeqIndex = SeqIndexes[i]; const Sequence *seq = m_InputSeqs->GetSequence(SeqIndex); ClusterMFA->AddSequence(seq, false); } 
AssertSameLabels(*ClusterMFA); m_ClusterMFAs.push_back(ClusterMFA); } AssertSameSeqsVec(*m_InputSeqs, m_ClusterMFAs); } float EACluster::AlignSeqPair(uint SeqIndex1, uint SeqIndex2) { const Sequence *Seq1 = m_InputSeqs->GetSequence(SeqIndex1); const Sequence *Seq2 = m_InputSeqs->GetSequence(SeqIndex2); string Path; float EA = AlignPairFlat(Seq1, Seq2, Path); return EA; } void EACluster::Validate() const { const uint SeqCount = m_InputSeqs->GetSeqCount(); const uint CentroidCount = SIZE(m_CentroidSeqIndexes); asserta(SIZE(m_CentroidIndexToSeqIndexes) == CentroidCount); for (uint CentroidIndex = 0; CentroidIndex < CentroidCount; ++CentroidIndex) { uint CentroidSeqIndex = m_CentroidSeqIndexes[CentroidIndex]; asserta(CentroidSeqIndex < SeqCount); const vector &MemberSeqIndexes = m_CentroidIndexToSeqIndexes[CentroidIndex]; const uint MemberCount = SIZE(MemberSeqIndexes); for (uint MemberIndex = 0; MemberIndex < MemberCount; ++MemberIndex) { uint MemberSeqIndex = MemberSeqIndexes[MemberIndex]; asserta(MemberSeqIndex < SeqCount); uint CentroidIndex2 = m_SeqIndexToCentroidIndex[MemberSeqIndex]; asserta(CentroidIndex2 == CentroidIndex); } } } void cmd_eacluster() { const string &InputFileName = opt(eacluster); MultiSequence InputSeqs; InputSeqs.FromFASTA(InputFileName); const float MinEA = (float) optd(minea, 0.9); string OutputFileNamePattern = optd(output, "cluster%.afa"); InitProbcons(); EACluster EC; EC.Run(InputSeqs, MinEA); EC.WriteMFAs(OutputFileNamePattern); } muscle-5.1.0/src/eacluster.h000066400000000000000000000012321424453062600157400ustar00rootroot00000000000000#pragma once #include "usorter.h" class EACluster { public: MultiSequence *m_InputSeqs = 0; USorter m_US; float m_MinEA = FLT_MAX; vector m_CentroidSeqIndexes; vector > m_CentroidIndexToSeqIndexes; vector m_SeqIndexToCentroidIndex; vector m_ClusterMFAs; public: void Clear(); void Run(MultiSequence &InputSeqs, float MinEA); void MakeClusterMFAs(); uint GetBestCentroid(uint SeqIndex, float MinEA, float 
&BestEA); float AlignSeqPair(uint SeqIndex1, uint SeqIndex2); void WriteMFAs(const string &FileNamePattern) const; void GetClusterMFAs(vector &MFAs) const; void Validate() const; }; muscle-5.1.0/src/eadistmx.cpp000066400000000000000000000045751424453062600161370ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "locallock.h" void ProgressLogInputSummary(const string &FileName, const MultiSequence &Seqs); void CalcEADistMx(FILE *f, MultiSequence* sequences, vector > &DistMx, vector *SparsePostVec) { DistMx.clear(); const uint SeqCount = sequences->GetSeqCount(); DistMx.resize(SeqCount); for (uint i = 0; i < SeqCount; ++i) { DistMx[i].resize(SeqCount, 0); DistMx[i][i] = 1; } if (SparsePostVec != 0) asserta(SIZE(*SparsePostVec) == 0); vector SeqIndexes1; vector SeqIndexes2; GetAllPairs(SeqCount, SeqIndexes1, SeqIndexes2); uint PairCount = SIZE(SeqIndexes1); asserta(SIZE(SeqIndexes1) == PairCount); uint PairCount2 = (SeqCount * (SeqCount - 1)) / 2; asserta(PairCount == PairCount2); // all-vs-all pairwise alignments for posterior probability matrices unsigned ThreadCount = GetRequestedThreadCount(); uint PairCounter = 0; float SumEA = 0; #pragma omp parallel for num_threads(ThreadCount) for (int PairIndex = 0; PairIndex < (int) PairCount; ++PairIndex) { uint SeqIndex1 = SeqIndexes1[PairIndex]; uint SeqIndex2 = SeqIndexes2[PairIndex]; const Sequence* seq1 = sequences->GetSequence(SeqIndex1); const Sequence* seq2 = sequences->GetSequence(SeqIndex2); const char *Label1 = seq1->m_Label.c_str(); const char *Label2 = seq2->m_Label.c_str(); Lock(); double MeanEA = (PairCounter == 0 ? 
0 : SumEA/PairCounter); ProgressStep(PairCounter++, PairCount, "%u consensus seqs, mean EE %.2g", SeqCount, 1 - MeanEA); Unlock(); string Path; float EA; if (SparsePostVec == 0) EA = AlignPairFlat(seq1, seq2, Path); else { MySparseMx *SparsePost = new MySparseMx; EA = AlignPairFlat_SparsePost(seq1, seq2, Path, SparsePost); SparsePostVec->push_back(SparsePost); } Lock(); DistMx[SeqIndex1][SeqIndex2] = EA; DistMx[SeqIndex2][SeqIndex1] = EA; if (f != 0) fprintf(f, "%s\t%s\t%.4g\n", Label1, Label2, EA); SumEA += EA; Unlock(); } } void cmd_eadistmx() { const string &InputFileName = opt(eadistmx); asserta(optset_output); FILE *f = CreateStdioFile(opt(output)); MultiSequence* sequences = new MultiSequence(); assert(sequences); sequences->LoadMFA(InputFileName, true); ProgressLogInputSummary(InputFileName, *sequences); InitProbcons(); vector > DistMx; CalcEADistMx(f, sequences, DistMx); CloseStdioFile(f); } muscle-5.1.0/src/eadistmxmsas.cpp000066400000000000000000000016611424453062600170140ustar00rootroot00000000000000#include "muscle.h" #include "pprog.h" void cmd_eadistmx_msas() { const string &FileName = opt(eadistmx_msas); vector MSAFileNames; ReadStringsFromFile(FileName, MSAFileNames); const uint MSACount = SIZE(MSAFileNames); asserta(optset_output); FILE *f = CreateStdioFile(opt(output)); PProg PP; if (optset_paircount) PP.m_TargetPairCount = opt(paircount); bool IsNucleo; PP.LoadMSAs(MSAFileNames, IsNucleo); SetAlpha(IsNucleo ? 
ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); PP.AlignAllInputPairs(); vector > &ScoreMx = PP.m_ScoreMx; for (uint i = 0; i < MSACount; ++i) { asserta(i < SIZE(ScoreMx)); const char *Labeli = PP.GetMSALabel(i).c_str(); for (uint j = i+1; j < MSACount; ++j) { asserta(j < SIZE(ScoreMx[i])); const char *Labelj = PP.GetMSALabel(j).c_str(); float Score = ScoreMx[i][j]; fprintf(f, "%s\t%s\t%.4f\n", Labeli, Labelj, Score); } } CloseStdioFile(f); } muscle-5.1.0/src/eesort.cpp000066400000000000000000000035221424453062600156110ustar00rootroot00000000000000#include "muscle.h" #include "sort.h" #include "locallock.h" void cmd_eesort() { const string &QueryFileName = opt(eesort); const string &DBFileName = opt(db); const string &OutputFileName = opt(output); FILE *fTsv = CreateStdioFile(opt(tsvout)); FILE *fFa = CreateStdioFile(OutputFileName); MultiSequence Query; MultiSequence DB; Query.FromFASTA(QueryFileName, true); Progress("Reading %s ...", DBFileName.c_str()); DB.FromFASTA(DBFileName, true); Progress("done\n"); bool IsNucleo = DB.GuessIsNucleo(); if (IsNucleo) SetAlpha(ALPHA_Nucleo); else SetAlpha(ALPHA_Amino); InitProbcons(); const uint QuerySeqCount = Query.GetSeqCount(); const uint DBSeqCount = DB.GetSeqCount(); unsigned ThreadCount = GetRequestedThreadCount(); uint PairCounter = 0; vector EAs(DBSeqCount, DBL_MAX); #pragma omp parallel for num_threads(ThreadCount) for (int iDBSeqIndex = 0; iDBSeqIndex < (int) DBSeqCount; ++iDBSeqIndex) { Lock(); ProgressStep(PairCounter++, DBSeqCount, "Calculating"); Unlock(); uint DBSeqIndex = uint(iDBSeqIndex); const Sequence *DBSeq = DB.GetSequence(DBSeqIndex); for (uint QuerySeqIndex = 0; QuerySeqIndex < QuerySeqCount; ++QuerySeqIndex) { const Sequence *QSeq = Query.GetSequence(QuerySeqIndex); string Path; double EA = AlignPairFlat(QSeq, DBSeq, Path); if (QuerySeqIndex == 0) { Lock(); EAs[DBSeqIndex] = EA; Unlock(); } } } vector Order(DBSeqCount); QuickSortOrderDesc(EAs.data(), DBSeqCount, Order.data()); for (uint k = 0; k < 
DBSeqCount; ++k) { ProgressStep(k, DBSeqCount, "Writing %s", OutputFileName.c_str()); uint DBSeqIndex = Order[k]; const Sequence *DBSeq = DB.GetSequence(DBSeqIndex); double EA = EAs[DBSeqIndex]; asserta(EA != DBL_MAX); Pf(fTsv, "%.3g %s\n", EA, DBSeq->GetLabel().c_str()); DBSeq->WriteMFA(fFa); } } muscle-5.1.0/src/efabestcols.cpp000066400000000000000000000027571424453062600166130ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" #include "sort.h" void cmd_efa_bestcols() { const string EfaFileName = opt(efa_bestcols); const string OutputFileName = opt(output); double MinConf = 1.0; if (optset_minconf) MinConf = opt(minconf); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); asserta(MaxGapFract >= 0 && MaxGapFract <= 1.0); uint MaxCols = UINT_MAX; if (optset_maxcols) MaxCols = opt(maxcols); Ensemble E; E.FromFile(EfaFileName); vector Confs; const uint UniqueIxCount = SIZE(E.m_UniqueIxToIxs); vector UniqueIxs; for (uint UniqueIx = 0; UniqueIx < UniqueIxCount; ++UniqueIx) { uint n = SIZE(Confs); double Pct = GetPct(n, UniqueIxCount); ProgressStep(UniqueIx, UniqueIxCount, "%u cols (%.1f%%) conf >= %.3g, gaps <= %.3g", n, Pct, MinConf, MaxGapFract); double Conf = E.GetConf(UniqueIx); if (Conf < MinConf) continue; uint Ix = E.m_UniqueIxs[UniqueIx]; double GapFract = E.GetGapFract(Ix); if (GapFract > MaxGapFract) continue; UniqueIxs.push_back(UniqueIx); Confs.push_back(Conf); } const uint M = SIZE(Confs); vector Order(M); QuickSortOrderDesc(Confs.data(), M, Order.data()); vector BestUniqueIxs; const uint N = min(SIZE(Order), MaxCols); for (uint i = 0; i < N; ++i) { uint UniqueIx = UniqueIxs[Order[i]]; BestUniqueIxs.push_back(UniqueIx); } MSA RepAln; E.MakeResampledMSA(BestUniqueIxs, RepAln); RepAln.ToFASTAFile(OutputFileName); } muscle-5.1.0/src/efabestconf.cpp000066400000000000000000000032741424453062600165730ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_efa_bestconf() { const string 
&FileName = opt(efa_bestconf); Ensemble E; E.FromFile(FileName); const uint SeqCount = E.GetSeqCount(); const uint MSACount = E.GetMSACount(); const uint IxCount = E.GetIxCount(); double AvgCols = double(IxCount)/MSACount; ProgressLog("%u seqs, %u MSAs, avg cols %.1f\n", SeqCount, MSACount, AvgCols); ProgressLog(" MSA Cols N1 N1f TotConf MedConf Name\n"); // 12345 1234567 12345 1234 1234567 1234567 uint BestMSAIndex_Total = 0; uint BestMSAIndex_Median = 0; string BestMSAName_Total; string BestMSAName_Median; double BestConf_Total = -1; double BestConf_Median = -1; for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *E.m_MSAs[MSAIndex]; const string &Name = E.m_MSANames[MSAIndex]; uint N1 = E.GetN1(MSAIndex); uint ColCount = M.GetColCount(); double TotalConf = E.GetTotalConf(MSAIndex); double MedianConf = E.GetMedianConf(MSAIndex); if (TotalConf > BestConf_Total) { BestConf_Total = TotalConf; BestMSAIndex_Total = MSAIndex; BestMSAName_Total = Name; } if (MedianConf > BestConf_Median) { BestConf_Median = MedianConf; BestMSAIndex_Median = MSAIndex; BestMSAName_Median = Name; } double N1f = double(N1)/ColCount; ProgressLog("%5u %7u %5u %4.2f %7.3f %7.4f %s\n", MSAIndex+1, ColCount, N1, N1f, TotalConf, MedianConf, Name.c_str()); } ProgressLog("Best MSA, total %u (%s)\n", BestMSAIndex_Total+1, BestMSAName_Total.c_str()); ProgressLog("Best MSA, median %u (%s)\n", BestMSAIndex_Median+1, BestMSAName_Median.c_str()); E.m_MSAs[BestMSAIndex_Median]->ToFASTAFile(opt(output)); } muscle-5.1.0/src/efaexplode.cpp000066400000000000000000000011251424453062600164210ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_efa_explode() { const string &InputFileName = opt(efa_explode); string Prefix; if (optset_prefix) Prefix = opt(prefix); string Suffix; if (optset_suffix) Suffix = opt(suffix); Ensemble E; E.FromFile(InputFileName); const uint MSACount = E.GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M 
= E.GetMSA(MSAIndex); string FileName = E.GetMSAName(MSAIndex); if (FileName == "") Ps(FileName, "%u", MSAIndex); FileName = Prefix + FileName + Suffix; M.ToFASTAFile(FileName); } } muscle-5.1.0/src/efastats.cpp000066400000000000000000000053251424453062600161250ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" #include "qscorer.h" static void CmpRef(const Ensemble &E, const MSA &RefMSA, double MaxGapFract, vector &Qs, vector &TCs) { Qs.clear(); TCs.clear(); QScorer QS; QS.m_MaxGapFract = MaxGapFract; const uint MSACount = E.GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &TestMSA = E.GetMSA(MSAIndex); const string &TestName = E.GetMSAName(MSAIndex); QS.Run(TestMSA, RefMSA); double Q = QS.m_Q; double TC = QS.m_TC; Qs.push_back(Q); TCs.push_back(TC); } } void cmd_efastats() { const string &InputFileName = opt(efastats); double MaxGapFract = optd(max_gap_fract, 0.5); const string &RefFileName = opt(ref); Ensemble E; E.FromFile(InputFileName); vector Qs; vector TCs; if (optset_ref) { MSA RefMSA; RefMSA.FromFASTAFile(opt(ref)); CmpRef(E, RefMSA, MaxGapFract, Qs, TCs); } const uint SeqCount = E.GetSeqCount(); const uint MSACount = E.GetMSACount(); const uint IxCount = E.GetIxCount(); double D_LetterPairs; double D_Columns; E.GetDispersion(MaxGapFract, D_LetterPairs, D_Columns); vector CCs; double AvgCols = double(IxCount)/MSACount; ProgressLog(" MSA Cols N1 N1f Conf CC"); // 12345 1234567 12345 1234 1234 12345 if (optset_ref) ProgressLog(" Q TC"); // 123456 123456 ProgressLog(" Name\n"); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *E.m_MSAs[MSAIndex]; const string &Name = E.m_MSANames[MSAIndex]; uint N1 = E.GetN1(MSAIndex); uint ColCount = M.GetColCount(); double TotalConf = E.GetTotalConf(MSAIndex); double N1f = double(N1)/ColCount; double CC = TotalConf/ColCount; CCs.push_back(CC); ProgressLog("%5u %7u %5u %4.2f %4.2f %5.3f", MSAIndex+1, ColCount, N1, N1f, TotalConf, CC); if 
(optset_ref) ProgressLog(" %6.4f %6.4f", Qs[MSAIndex], TCs[MSAIndex]); ProgressLog(" %s\n", Name.c_str()); } sort(CCs.begin(), CCs.end()); double MedianCC = CCs[MSACount/2]; Progress("%u seqs, %u MSAs, avg cols %.1f, D_LP %.3g, D_Cols %.3g, CC %.3g", SeqCount, MSACount, AvgCols, D_LetterPairs, D_Columns, MedianCC); Log("@SUMMARY input=%s D_LP=%.4f D_Cols=%.4f CC=%.4f", InputFileName.c_str(), D_LetterPairs, D_Columns, MedianCC); if (optset_ref) { sort(Qs.begin(), Qs.end()); sort(TCs.begin(), TCs.end()); asserta(SIZE(Qs) == MSACount); asserta(SIZE(TCs) == MSACount); double MedianQ = Qs[MSACount/2]; double MedianTC = TCs[MSACount/2]; double E_LP = 1 - MedianQ; double E_Cols = 1 - MedianTC; Progress(" E_LP %.4f, E_Cols %.4f", E_LP, E_Cols); Log(" E_LP=%.4f E_Cols=%.4f", E_LP, E_Cols); } Progress("\n"); Log("\n"); } muscle-5.1.0/src/ensemble.cpp000066400000000000000000000561231424453062600161070ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" #include "qscorer.h" static char ReadFirstChar(const string &FileName) { FILE *f = OpenStdioFile(FileName); char c; ReadStdioFile(f, &c, 1); CloseStdioFile(f); return c; } void Ensemble::SetDerived() { ToUpper(); MapLabels(); SortMSAs(); SetUngappedSeqs(); SetColToPosVec(); SetColumns(); } void Ensemble::MapLabels() { asserta(!m_MSAs.empty()); const MSA &M0 = *m_MSAs[0]; const uint SeqCount = M0.GetSeqCount(); M0.GetLabelToSeqIndex(m_Labels0, m_LabelToSeqIndex0); asserta(SIZE(m_Labels0) == SeqCount); } void Ensemble::SortMSA(MSA &M) { const MSA &M0 = *m_MSAs[0]; asserta(&M != &M0); const uint SeqCount = GetSeqCount(); map LabelToSeqIndex2; vector Labels2; M.GetLabelToSeqIndex(Labels2, LabelToSeqIndex2); char **szSeqsSorted = myalloc(char *, SeqCount); memset_zero(szSeqsSorted, SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string &Label = Labels2[SeqIndex]; map::const_iterator p = m_LabelToSeqIndex0.find(Label); if (p == m_LabelToSeqIndex0.end()) Die("SortMSA, different labels 
(%s)", Label.c_str()); uint SeqIndex0 = p->second; asserta(szSeqsSorted[SeqIndex0] == 0); szSeqsSorted[SeqIndex0] = M.m_szSeqs[SeqIndex]; } M.m_szNames = M0.m_szNames; M.m_szSeqs = szSeqsSorted; M.GetLabelToSeqIndex(Labels2, LabelToSeqIndex2); asserta(Labels2 == m_Labels0); asserta(LabelToSeqIndex2 == m_LabelToSeqIndex0); } void Ensemble::SortMSAs() { const uint MSACount = GetMSACount(); const uint SeqCount = GetSeqCount(); const MSA &M0 = *m_MSAs[0]; for (uint MSAIndex = 1; MSAIndex < MSACount; ++MSAIndex) { MSA &M = *m_MSAs[MSAIndex]; const uint SeqCount2 = M.GetSeqCount(); if (SeqCount2 != SeqCount) Die("Bad ensemble, different nr seqs"); SortMSA(M); } } void Ensemble::FromEFA(const string &FN) { Clear(); vector Strings; ReadStringsFromFile(FN, Strings); if (Strings.empty()) Die("Empty EFA (%s)", FN.c_str()); if (Strings[0].c_str()[0] != '<') Die("Invalid EFA, must start with '<' (%s)", FN.c_str()); vector MSAStrings; for (uint i = 0; i < SIZE(Strings); ++i) { const string &s = Strings[i]; if (s.c_str()[0] == '<') { if (!MSAStrings.empty()) { MSA &M = *new MSA; M.FromStrings(MSAStrings); m_MSAs.push_back(&M); MSAStrings.clear(); } string MSAName = s.substr(1); m_MSANames.push_back(MSAName); } else MSAStrings.push_back(s); } MSA &M = *new MSA; M.FromStrings(MSAStrings); m_MSAs.push_back(&M); if (SIZE(m_MSAs) != SIZE(m_MSANames)) Die("Invalid EFA, %u MSAs %u names (%s)", SIZE(m_MSAs), SIZE(m_MSANames), FN.c_str()); SetDerived(); } void Ensemble::ToEFA(const string &FN) const { if (FN.empty()) return; FILE *f = CreateStdioFile(FN); const uint MSACount = GetMSACount(); asserta(SIZE(m_MSANames) == MSACount); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const string &MSAName = m_MSANames[MSAIndex]; fprintf(f, "<%s\n", MSAName.c_str()); const MSA &M = *m_MSAs[MSAIndex]; M.ToFASTAFile(f); } CloseStdioFile(f); } void Ensemble::FromFile(const string &FN) { char c = ReadFirstChar(FN); if (c == '<') FromEFA(FN); else FromMSAPaths(FN); } void 
Ensemble::FromMSAPaths(const string &FN) { Clear(); m_MSANames.clear(); ReadStringsFromFile(FN, m_MSANames); const uint MSACount = SIZE(m_MSANames); if (MSACount == 0) { Warning("Empty ensemble"); return; } for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { ProgressStep(MSAIndex, MSACount, "Reading m_MSAs"); const string &MSAFileName = m_MSANames[MSAIndex]; MSA *M = new MSA; M->FromFASTAFile(MSAFileName); m_MSAs.push_back(M); } if (opt(basename)) { for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const string &MSAFileName = m_MSANames[MSAIndex]; m_MSANames[MSAIndex] = string(BaseName(MSAFileName.c_str())); } } if (opt(intsuffix)) { for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { string Name = m_MSANames[MSAIndex]; Psa(Name, ".%u", MSAIndex); m_MSANames[MSAIndex] = Name; } } SetDerived(); } uint Ensemble::GetSeqCount() const { if (m_MSAs.empty()) return 0; uint SeqCount = m_MSAs[0]->GetSeqCount(); return SeqCount; } void Ensemble::ToUpper() { const uint MSACount = GetMSACount(); const uint SeqCount = GetSeqCount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char *Seq = M.m_szSeqs[SeqIndex]; for (uint i = 0; i < ColCount; ++i) Seq[i] = toupper(Seq[i]); } } } void Ensemble::MakeResampledMSA(const vector &UniqueIxs, MSA &M) const { M.Clear(); const uint ColCount = SIZE(UniqueIxs); const uint SeqCount = GetSeqCount(); M.SetSize(SeqCount, ColCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string &Label = m_Labels0[SeqIndex]; M.m_szNames[SeqIndex] = mystrsave(Label.c_str()); } for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { uint UniqueIx = UniqueIxs[ColIndex]; asserta(UniqueIx < SIZE(m_UniqueIxs)); uint Ix = m_UniqueIxs[UniqueIx]; asserta(Ix < SIZE(m_ColumnStrings)); const string &ColumnString = m_ColumnStrings[Ix]; asserta(SIZE(ColumnString) == SeqCount); for (uint 
SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char c = ColumnString[SeqIndex]; M.m_szSeqs[SeqIndex][ColIndex] = c; } } } void Ensemble::GetHiQualUniqueIxs(double MaxGapFract, double MinConf, vector &UniqueIxs) const { UniqueIxs.clear(); const uint N = SIZE(m_UniqueIxs); for (uint UniqueIx = 0; UniqueIx < N; ++UniqueIx) { uint Ix = m_UniqueIxs[UniqueIx]; double Conf = GetConf(UniqueIx); if (Conf < MinConf) continue; double GapFract = GetGapFract(Ix); if (GapFract <= MaxGapFract) UniqueIxs.push_back(UniqueIx); } } uint Ensemble::GetMedianHiQualColCount(double MaxGapFract, double MinConf) const { vector ColCounts; const uint MSACount = SIZE(m_MSAs); if (MSACount == 0) return 0; for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *m_MSAs[MSAIndex]; const uint SeqCount = M.GetSeqCount(); const uint ColCount = M.GetColCount(); uint NonGappyColCount = 0; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { double Conf = GetConf_MSACol(MSAIndex, ColIndex); if (Conf < MinConf) continue; uint GapCount = M.GetGapCount(ColIndex); double GapFract = double(GapCount)/double(SeqCount); if (GapFract <= MaxGapFract) ++NonGappyColCount; } ColCounts.push_back(NonGappyColCount); } sort(ColCounts.begin(), ColCounts.end()); uint MedianColCount = ColCounts[MSACount/2]; return MedianColCount; } void Ensemble::SetUngappedSeqs() { m_UngappedSeqs.clear(); const MSA &M0 = *m_MSAs[0]; const uint SeqCount = M0.GetSeqCount(); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { string UngappedSeq; M0.GetUngappedSeqStr(SeqIndex, UngappedSeq); m_UngappedSeqs.push_back(UngappedSeq); } // Validate same seqs const uint MSACount = GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *m_MSAs[MSAIndex]; asserta(M.GetSeqCount() == SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { asserta(strcmp(M0.m_szNames[SeqIndex], M.m_szNames[SeqIndex]) == 0); string UngappedSeq; M.GetUngappedSeqStr(SeqIndex, UngappedSeq); if 
(UngappedSeq != m_UngappedSeqs[SeqIndex]) { const uint L = SIZE(UngappedSeq); const uint L2 = SIZE(m_UngappedSeqs[SeqIndex]); Log(">%s\n", M0.m_szNames[SeqIndex]); Log("%s\n", UngappedSeq.c_str()); Log("%s\n", m_UngappedSeqs[SeqIndex].c_str()); for (uint i = 0; i < max(L, L2); ++i) { if (i >= min(L, L2)) { Log("*"); continue; } char c = UngappedSeq[i]; char c2 = m_UngappedSeqs[SeqIndex][i]; if (c == c2) Log(" "); else Log("d"); } Log("\n"); Die("MSA %u UngappedSeq != m_UngappedSeqs[%u]", MSAIndex, SeqIndex); } } } } void Ensemble::SetColToPosVec() { const uint MSACount = GetMSACount(); const uint SeqCount = GetSeqCount(); m_ColToPosVec.clear(); m_ColToPosVec.resize(MSACount); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *m_MSAs[MSAIndex]; m_ColToPosVec[MSAIndex].resize(SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) M.GetColToPos(SeqIndex, m_ColToPosVec[MSAIndex][SeqIndex]); } } void Ensemble::GetColumn(uint MSAIndex, uint ColIndex, string &ColStr, vector &PosVec) const { ColStr.clear(); PosVec.clear(); const MSA &M = *m_MSAs[MSAIndex]; const uint SeqCount = GetSeqCount(); ColStr.resize(SeqCount, '?'); PosVec.resize(SeqCount, UINT_MAX); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char c = M.GetChar(SeqIndex, ColIndex); ColStr[SeqIndex] = c; uint Pos = m_ColToPosVec[MSAIndex][SeqIndex][ColIndex]; PosVec[SeqIndex] = Pos; if (Pos != UINT_MAX) { const string &UngappedSeq = m_UngappedSeqs[SeqIndex]; asserta(Pos < SIZE(UngappedSeq)); char c2 = UngappedSeq[Pos]; asserta(c2 == c); } } for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) asserta(ColStr[SeqIndex] != '?'); } void Ensemble::SetColumns() { m_ColumnStrings.clear(); m_ColumnPositions.clear(); m_IxToMSAIndex.clear(); m_IxToColIndex.clear(); const uint MSACount = GetMSACount(); const uint SeqCount = GetSeqCount(); if (MSACount == 0) return; for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { map LabelToSeqIndex2; const MSA &M = 
*m_MSAs[MSAIndex]; uint SeqCount2 = M.GetSeqCount(); asserta(SeqCount2 == SeqCount); const uint ColCount = M.GetColCount(); for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { string ColStr; vector PosVec; GetColumn(MSAIndex, ColIndex, ColStr, PosVec); m_ColumnStrings.push_back(ColStr); m_ColumnPositions.push_back(PosVec); m_IxToMSAIndex.push_back(MSAIndex); m_IxToColIndex.push_back(ColIndex); } } SetUniqueColMap(); } void Ensemble::SetUniqueColMap() { m_UniqueIxs.clear(); m_UniqueIxToIxs.clear(); m_IxToUniqueIx.clear(); m_UniqueColMap.clear(); m_MSAColToIx.clear(); const uint MSACount = GetMSACount(); m_MSAColToIx.resize(MSACount); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *m_MSAs[MSAIndex]; uint ColCount = M.GetColCount(); m_MSAColToIx[MSAIndex].resize(ColCount, UINT_MAX); } const vector Empty; const uint N = SIZE(m_ColumnPositions); for (uint Ix = 0; Ix < N; ++Ix) { uint MSAIndex = m_IxToMSAIndex[Ix]; uint ColIndex = m_IxToColIndex[Ix]; asserta(MSAIndex < MSACount); asserta(ColIndex < SIZE(m_MSAColToIx[MSAIndex])); m_MSAColToIx[MSAIndex][ColIndex] = Ix; const vector &PosVec = m_ColumnPositions[Ix]; map, uint>::const_iterator p = m_UniqueColMap.find(PosVec); if (p == m_UniqueColMap.end()) { uint UniqueIx = SIZE(m_UniqueIxs); m_UniqueColMap[PosVec] = UniqueIx; m_UniqueIxs.push_back(Ix); asserta(SIZE(m_UniqueIxToIxs) == UniqueIx); m_UniqueIxToIxs.push_back(Empty); m_UniqueIxToIxs[UniqueIx].push_back(Ix); m_IxToUniqueIx.push_back(UniqueIx); } else { uint UniqueIx = p->second; m_UniqueIxToIxs[UniqueIx].push_back(Ix); m_IxToUniqueIx.push_back(UniqueIx); } } ValidateUniqueColMap(); } void Ensemble::ValidateUniqueColMap1(uint MSAIndex, uint ColIndex) const { asserta(ColIndex < SIZE(m_MSAColToIx[MSAIndex])); uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; asserta(Ix < SIZE(m_ColumnPositions)); const vector &PosVec = m_ColumnPositions[Ix]; asserta(Ix < SIZE(m_IxToUniqueIx)); uint UniqueIx = m_IxToUniqueIx[Ix]; asserta(UniqueIx < 
SIZE(m_UniqueIxToIxs)); const vector &Ixs = m_UniqueIxToIxs[UniqueIx]; bool Found = false; for (uint i = 0; i < SIZE(Ixs); ++i) { if (Ixs[i] == Ix) { Found = true; break; } } asserta(Found); map, uint>::const_iterator p = m_UniqueColMap.find(PosVec); asserta(p != m_UniqueColMap.end()); uint UniqueIx2 = p->second; asserta(UniqueIx == UniqueIx2); } void Ensemble::ValidateUniqueIx(uint UniqueIx) const { asserta(UniqueIx < SIZE(m_UniqueIxs)); asserta(UniqueIx < SIZE(m_UniqueIxToIxs)); uint Ix = m_UniqueIxs[UniqueIx]; asserta(Ix < SIZE(m_ColumnPositions)); const vector &PosVec = m_ColumnPositions[Ix]; map, uint>::const_iterator p = m_UniqueColMap.find(PosVec); asserta(p != m_UniqueColMap.end()); asserta(p->first == PosVec); const vector &Ixs = m_UniqueIxToIxs[UniqueIx]; for (uint i = 0; i < SIZE(Ixs); ++i) { uint Ix2 = Ixs[i]; asserta(Ix2 < SIZE(m_IxToUniqueIx)); uint UniqueIx2 = m_IxToUniqueIx[Ix2]; asserta(UniqueIx2 == UniqueIx); const vector &PosVec2 = m_ColumnPositions[Ix2]; asserta(PosVec2 == PosVec); } } void Ensemble::ValidateUniqueColMap() const { const uint MSACount = GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &M = *m_MSAs[MSAIndex]; uint ColCount = M.GetColCount(); asserta(MSAIndex < SIZE(m_MSAColToIx)); for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) ValidateUniqueColMap1(MSAIndex, ColIndex); } const uint UniqueIxCount = SIZE(m_UniqueIxs); for (uint UniqueIx = 0; UniqueIx < UniqueIxCount; ++UniqueIx) ValidateUniqueIx(UniqueIx); } double Ensemble::GetGapFract(uint Ix) const { asserta(Ix < SIZE(m_ColumnStrings)); const string &ColStr = m_ColumnStrings[Ix]; const uint SeqCount = GetSeqCount(); asserta(SIZE(ColStr) == SeqCount); uint GapCount = 0; for (uint i = 0; i < SeqCount; ++i) { char c = ColStr[i]; if (isgap(c)) ++GapCount; } double GapFract = double(GapCount)/SeqCount; return GapFract; } void Ensemble::SubsampleWithReplacement(double MaxGapFract, uint ColCount, MSA &M) const { vector Ixs; 
GetIxSubset(MaxGapFract, Ixs); SubsampleWithReplacement(Ixs, ColCount, M); } void Ensemble::SubsampleWithReplacement(const vector &Ixs, uint ColCount, MSA &M) const { asserta(ColCount > 0); const uint SeqCount = GetSeqCount(); M.SetSize(SeqCount, ColCount); asserta(SIZE(m_Labels0) == SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string &Label = m_Labels0[SeqIndex]; M.m_szNames[SeqIndex] = mystrsave(Label.c_str()); } const uint N = SIZE(Ixs); for (uint i = 0; i < N; ++i) { uint r = randu32()%N; uint Ix = Ixs[r]; asserta(Ix < SIZE(m_ColumnStrings)); const string &ColStr = m_ColumnStrings[Ix]; asserta(SIZE(ColStr) == SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) M.m_szSeqs[SeqIndex][i] = ColStr[SeqIndex]; } } void Ensemble::GetIxSubset(double MaxGapFract, vector &Ixs) const { Ixs.clear(); const uint IxCount = SIZE(m_ColumnStrings); for (uint Ix = 0; Ix < IxCount; ++Ix) { double GapFract = GetGapFract(Ix); if (GapFract <= MaxGapFract) Ixs.push_back(Ix); } } void Ensemble::GetAbToCountAll(vector &AbToCountAll) { const uint MSACount = GetMSACount(); AbToCountAll.clear(); AbToCountAll.resize(MSACount+1, 0); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { vector AbToCount; GetAbToCount(MSAIndex, AbToCount); asserta(SIZE(AbToCount) == MSACount); for (uint i = 0; i < MSACount; ++i) AbToCountAll[i] += AbToCount[i]; } } void Ensemble::GetAbToCount(uint MSAIndex, vector &AbToCount) { const uint MSACount = GetMSACount(); asserta(MSAIndex < MSACount); AbToCount.clear(); AbToCount.resize(MSACount+1, 0); const MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); for (uint Col = 0; Col < ColCount; ++Col) { uint Ab = GetAb(MSAIndex, Col); asserta(Ab > 0); asserta(Ab <= MSACount); ++AbToCount[Ab]; } } uint Ensemble::GetIx(uint MSAIndex, uint ColIndex) const { asserta(MSAIndex < SIZE(m_MSAColToIx)); asserta(ColIndex < SIZE(m_MSAColToIx[MSAIndex])); uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; return Ix; } uint 
Ensemble::GetUniqueIx(uint MSAIndex, uint ColIndex) const { uint Ix = GetIx(MSAIndex, ColIndex); asserta(Ix < SIZE(m_IxToUniqueIx)); uint UniqueIx = m_IxToUniqueIx[Ix]; return UniqueIx; } double Ensemble::GetMedianConf(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_MSAs)); const MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); vector Confs; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; asserta(Ix < SIZE(m_IxToUniqueIx)); uint UniqueIx = m_IxToUniqueIx[Ix]; double Conf = GetConf(UniqueIx); Confs.push_back(Conf); } sort(Confs.begin(), Confs.end()); double MedianConf = Confs[ColCount/2]; return MedianConf; } double Ensemble::GetTotalConf(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_MSAs)); const MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); asserta(MSAIndex < SIZE(m_MSAColToIx)); asserta(SIZE(m_MSAColToIx[MSAIndex]) == ColCount); double SumConf = 0; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; asserta(Ix < SIZE(m_IxToUniqueIx)); uint UniqueIx = m_IxToUniqueIx[Ix]; double Conf = GetConf(UniqueIx); SumConf += Conf; } return SumConf; } double Ensemble::GetConf_MSACol(uint MSAIndex, uint ColIndex) const { asserta(MSAIndex < SIZE(m_MSAColToIx)); asserta(ColIndex < SIZE(m_MSAColToIx[MSAIndex])); uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; uint UniqueIx = m_IxToUniqueIx[Ix]; double Conf = GetConf(UniqueIx); return Conf; } double Ensemble::GetConf(uint UniqueIx) const { const uint MSACount = GetMSACount(); asserta(UniqueIx < SIZE(m_UniqueIxToIxs)); const vector &Ixs = m_UniqueIxToIxs[UniqueIx]; uint Ab = SIZE(Ixs); double Conf = double(Ab)/MSACount; return Conf; } uint Ensemble::GetAb(uint MSAIndex, uint ColIndex) const { uint UniqueIx = GetUniqueIx(MSAIndex, ColIndex); asserta(UniqueIx < SIZE(m_UniqueIxToIxs)); const vector &Ixs = m_UniqueIxToIxs[UniqueIx]; uint Ab = SIZE(Ixs); asserta(Ab > 0); return Ab; } uint 
Ensemble::GetN1(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_MSAs)); const MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); uint N1 = 0; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { uint Ab = GetAb(MSAIndex, ColIndex); if (Ab == 1) ++N1; } return N1; } void Ensemble::GetDispersion(double MaxGapFract, double &D_LetterPairs, double &D_Columns) const { QScorer QS; QS.m_MaxGapFract = MaxGapFract; vector Qs; vector TCs; const uint MSACount = GetMSACount(); const uint PairCount = (MSACount*(MSACount - 1))/2; uint PairIndex = 0; for (uint i = 0; i < MSACount; ++i) { const MSA &MSAi = *m_MSAs[i]; const string &Namei = m_MSANames[i]; for (uint j = i + 1; j < MSACount; ++j) { ProgressStep(PairIndex++, PairCount, "Pairwise dists"); const MSA &MSAj = *m_MSAs[j]; const string &Namej = m_MSANames[j]; QS.Run(MSAi, MSAj); double Qij = QS.m_Q; double TCij = QS.m_TC; QS.Run(MSAj, MSAi); double Qji = QS.m_Q; double TCji = QS.m_TC; double Q = (Qij + Qji)/2; double TC = (TCij + TCji)/2; asserta(Q >= 0 && Q <= 1); asserta(TC >= 0 && TC <= 1); Qs.push_back(Q); TCs.push_back(TC); } } sort(Qs.begin(), Qs.end()); sort(TCs.begin(), TCs.end()); const uint N = SIZE(Qs); asserta(SIZE(TCs) == N); double MedianQ = Qs[N/2]; double MedianTC = TCs[N/2]; D_LetterPairs = 1.0 - MedianQ; D_Columns = 1.0 - MedianTC; asserta(D_LetterPairs >= 0 && D_LetterPairs <= 1); asserta(D_Columns >= 0 && D_Columns <= 1); } void Ensemble::CheckRefMSA(const MSA &Ref) const { const uint SeqCount = Ref.GetSeqCount(); const uint RefSeqCount = Ref.GetSeqCount(); const uint RefColCount = Ref.GetColCount(); if (RefSeqCount != SeqCount) Die("Different nr seqs"); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string Label = string(Ref.GetSeqName(SeqIndex)); if (Label != m_Labels0[SeqIndex]) Die("GetRefUniqueIxs, not sorted"); } } void Ensemble::GetRefUniqueIxs(const MSA &Ref, set &UniqueIxs, double MaxGapFract) const { UniqueIxs.clear(); CheckRefMSA(Ref); const uint 
SeqCount = GetSeqCount(); const uint RefColCount = Ref.GetColCount(); vector > ColToPosVec(SeqCount); for (uint RefSeqIndex = 0; RefSeqIndex < SeqCount; ++RefSeqIndex) Ref.GetColToPos(RefSeqIndex, ColToPosVec[RefSeqIndex]); for (uint RefColIndex = 0; RefColIndex < RefColCount; ++RefColIndex) { bool IsUpper = Ref.ColIsUpper(RefColIndex, MaxGapFract); if (!IsUpper) continue; vector PosVec(SeqCount, UINT_MAX); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { asserta(SeqIndex < SIZE(ColToPosVec)); asserta(RefColIndex < SIZE(ColToPosVec[SeqIndex])); uint Pos = ColToPosVec[SeqIndex][RefColIndex]; PosVec[SeqIndex] = Pos; } map, uint >::const_iterator p = m_UniqueColMap.find(PosVec); if (p != m_UniqueColMap.end()) { uint UniqueIx = p->second; UniqueIxs.insert(UniqueIx); } } } void Ensemble::GetRefPosSet(const MSA &Ref, double MaxGapFract, set > &PosSet) const { PosSet.clear(); CheckRefMSA(Ref); const uint SeqCount = GetSeqCount(); const uint RefColCount = Ref.GetColCount(); vector > ColToPosVec(SeqCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) Ref.GetColToPos(SeqIndex, ColToPosVec[SeqIndex]); for (uint RefColIndex = 0; RefColIndex < RefColCount; ++RefColIndex) { bool IsUpper = Ref.ColIsUpper(RefColIndex, MaxGapFract); if (!IsUpper) continue; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { uint Pos = ColToPosVec[SeqIndex][RefColIndex]; pair SeqPos(SeqIndex, Pos); PosSet.insert(SeqPos); } } } void Ensemble::GetTestUniqueIxs(uint MSAIndex, const set > &RefPosSet, vector &UniqueIxs, vector &Confs) const { UniqueIxs.clear(); Confs.clear(); const uint MSACount = GetMSACount(); const uint SeqCount = GetSeqCount(); asserta(MSAIndex < MSACount); const MSA &M = *m_MSAs[MSAIndex]; const uint ColCount = M.GetColCount(); asserta(MSAIndex < SIZE(m_MSAColToIx)); asserta(SIZE(m_MSAColToIx[MSAIndex]) == ColCount); for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { uint Ix = m_MSAColToIx[MSAIndex][ColIndex]; asserta(Ix < 
SIZE(m_ColumnPositions)); const vector &PosVec = m_ColumnPositions[Ix]; asserta(SIZE(PosVec) == SeqCount); uint FoundCount = 0; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { uint Pos = PosVec[SeqIndex]; if (Pos == UINT_MAX) continue; pair SeqPos(SeqIndex, Pos); if (RefPosSet.find(SeqPos) != RefPosSet.end()) ++FoundCount; } if (FoundCount >= SeqCount/2) { asserta(Ix < SIZE(m_IxToUniqueIx)); uint UniqueIx = m_IxToUniqueIx[Ix]; double Conf = GetConf(UniqueIx); UniqueIxs.push_back(UniqueIx); Confs.push_back(Conf); } } } const MSA &Ensemble::GetMSA(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_MSAs)); return *m_MSAs[MSAIndex]; } const string &Ensemble::GetMSAName(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_MSANames)); return m_MSANames[MSAIndex]; } muscle-5.1.0/src/ensemble.h000066400000000000000000000064231424453062600155520ustar00rootroot00000000000000#pragma once #include class Ensemble { public: vector m_MSAs; vector m_MSANames; vector m_Labels0; map m_LabelToSeqIndex0; vector m_UngappedSeqs; vector m_ColumnStrings; vector > m_ColumnPositions; vector > > m_ColToPosVec; vector m_IxToMSAIndex; vector m_IxToColIndex; vector > m_MSAColToIx; vector m_UniqueIxs; vector > m_UniqueIxToIxs; vector m_IxToUniqueIx; map, uint> m_UniqueColMap; public: void Clear() { m_MSAs.clear(); m_MSANames.clear(); m_ColumnStrings.clear(); m_ColumnPositions.clear(); m_Labels0.clear(); m_LabelToSeqIndex0.clear(); m_ColToPosVec.clear(); m_IxToMSAIndex.clear(); m_IxToColIndex.clear(); m_UniqueIxToIxs.clear(); m_UniqueIxs.clear(); m_IxToUniqueIx.clear(); m_UniqueColMap.clear(); m_MSAColToIx.clear(); } void FromFile(const string &FileName); void FromMSAPaths(const string &FileName); void FromEFA(const string &FileName); void ToEFA(const string &FileName) const; void SetDerived(); uint GetMSACount() const { return SIZE(m_MSAs); } uint GetIxCount() const { return SIZE(m_IxToMSAIndex); } uint GetSeqCount() const; void SetColumns(); void GetColumn(uint MSAIndex, uint ColIndex, 
string &ColStr, vector &ColPos) const; void GetIxSubset(double MaxGapFract, vector &Ixs) const; double GetGapFract(uint Ix) const; void SubsampleWithReplacement(double MaxGapFract, uint ColCount, MSA &M) const; void SubsampleWithReplacement(const vector &Ixs, uint ColCount, MSA &M) const; void GetAbToCountAll(vector &AbToCount); void GetAbToCount(uint MSAIndex, vector &AbToCount); uint GetUniqueIx(uint MSAIndex, uint ColIndex) const; uint GetIx(uint MSAIndex, uint ColIndex) const; uint GetAb(uint MSAIndex, uint ColIndex) const; double GetConf(uint UniqueIx) const; double GetConf_MSACol(uint MSAIndex, uint ColIndex) const; uint GetN1(uint MSAIndex) const; void ValidateUniqueColMap() const; void ValidateUniqueColMap1(uint MSAIndex, uint ColIndex) const; void ValidateUniqueIx(uint UniqueIx) const; void GetDispersion(double MaxGapFract, double &D_LetterPairs, double &D_Columns) const; double GetTotalConf(uint MSAIndex) const; double GetMedianConf(uint MSAIndex) const; void SortMSA(MSA &M); void CheckRefMSA(const MSA &Ref) const; void GetRefPosSet(const MSA &Ref, double MaxGapFract, set > &PosSet) const; void GetTestUniqueIxs(uint MSAIndex, const set > &RefPosSet, vector &UniqueIxs, vector &Confs) const; void GetRefUniqueIxs(const MSA &Ref, set &UniqueIxs, double MaxGapFract) const; void MakeResampledMSA(const vector &UniqueIxs, MSA &M) const; uint GetMedianHiQualColCount(double MaxGapFract, double MinConf) const; void GetHiQualUniqueIxs(double MaxGapFract, double MinConf, vector &UniqueIxs) const; const MSA &GetMSA(uint MSAIndex) const; const string &GetMSAName(uint MSAIndex) const; void GetLetterConfsVec(const MSA &Ref, double MaxGapFract, vector > &LetterConfsVec) const; private: void MapLabels(); void SortMSAs(); void ToUpper(); void SetColToPosVec(); void SetUngappedSeqs(); void SetUniqueColMap(); }; muscle-5.1.0/src/fa2efa.cpp000066400000000000000000000005651424453062600154400ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_fa2efa() 
{ const string &InputFileName = opt(fa2efa); const string &OutputFileName = opt(output); Ensemble E; E.FromMSAPaths(InputFileName); Progress("%u seqs, %u MSAs\n", E.GetSeqCount(), E.GetMSACount()); Progress("Writing %s ...\n", OutputFileName.c_str()); E.ToEFA(OutputFileName); Progress("done.\n"); } muscle-5.1.0/src/fasta.cpp000066400000000000000000000063401424453062600154070ustar00rootroot00000000000000#include "muscle.h" const unsigned FASTA_BLOCK = 60; void MSA::FromFASTAFile(TextFile &File) { Clear(); FILE *f = File.GetStdioFile(); unsigned uSeqCount = 0; unsigned uColCount = UINT_MAX; for (;;) { char *Label; unsigned uSeqLength; char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps); char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false); if (0 == SeqData) break; AppendSeq(SeqData, uSeqLength, Label); } } void MSA::FromFASTAFile_PreserveCase(const string &FileName) { extern bool g_FASTA_Upper; bool SaveUpper = g_FASTA_Upper; g_FASTA_Upper = false; FromFASTAFile(FileName); g_FASTA_Upper = true; } void MSA::FromStrings(const vector &Strings) { Clear(); if (Strings.empty()) Die("MSA::FromStrings, no data"); vector Labels; vector Seqs; string CurrSeq; for (uint i = 0; i < SIZE(Strings); ++i) { const string &s = Strings[i]; char s0 = s.c_str()[0]; if (s0 == '>') { if (!Labels.empty()) Seqs.push_back(CurrSeq); Labels.push_back(s.substr(1)); CurrSeq.clear(); } else { for (uint i = 0; i < SIZE(s); ++i) { char c = s[i]; if (!isspace(c)) CurrSeq.push_back(c); } } } Seqs.push_back(CurrSeq); FromStrings2(Labels, Seqs); } void MSA::FromStrings2(const vector &Labels, vector &Seqs) { const uint SeqCount = SIZE(Labels); if (SIZE(Seqs) != SeqCount) Die("Invalid FASTA, %u labels %u seqs", SIZE(Labels), SIZE(Seqs)); if (SeqCount == 0) Die("Empty FASTA"); const uint ColCount = SIZE(Seqs[0]); SetSize(SeqCount, ColCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const char *Label = Labels[SeqIndex].c_str(); const string &Str = 
Seqs[SeqIndex]; const uint n = SIZE(Str); if (n != ColCount) Die("MSA not aligned, seq lengths %u, %u", ColCount, n); const char *S = Str.c_str(); m_szNames[SeqIndex] = mystrsave(Label); m_szSeqs[SeqIndex] = mystrsave(S); } } void MSA::FromFASTAFile(const string &FileName) { Clear(); TextFile TF(FileName); FromFASTAFile(TF); TF.Close(); } void MSA::ToFASTAFile(const string &FileName) const { if (FileName.empty()) return; TextFile TF(FileName, true); ToFASTAFile(TF); TF.Close(); } void MSA::ToFASTAFile(FILE *f) const { if (f == 0) return; for (uint SeqIndex = 0; SeqIndex < m_uSeqCount; ++SeqIndex) { const byte *S = (const byte *) m_szSeqs[SeqIndex]; const char *Label = m_szNames[SeqIndex]; SeqToFasta(f, S, m_uColCount, Label); } } void MSA::ToFASTAFile(TextFile &File) const { const unsigned uColCount = GetColCount(); assert(uColCount > 0); const unsigned uLinesPerSeq = (GetColCount() - 1)/FASTA_BLOCK + 1; const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { File.PutString(">"); File.PutString(GetSeqName(uSeqIndex)); File.PutString("\n"); unsigned n = 0; for (unsigned uLine = 0; uLine < uLinesPerSeq; ++uLine) { unsigned uLetters = uColCount - uLine*FASTA_BLOCK; if (uLetters > FASTA_BLOCK) uLetters = FASTA_BLOCK; for (unsigned i = 0; i < uLetters; ++i) { char c = GetChar(uSeqIndex, n); File.PutChar(c); ++n; } File.PutChar('\n'); } } } muscle-5.1.0/src/fasta2.cpp000066400000000000000000000046641424453062600155000ustar00rootroot00000000000000#include "muscle.h" #include #include const int BUFFER_BYTES = 16*1024; //const int BUFFER_BYTES = 128; const int CR = '\r'; const int NL = '\n'; bool g_FASTA_Upper = true; bool g_FASTA_AllowDigits = false; #define ADD(c) \ { \ if (Pos >= BufferLength) \ { \ const int NewBufferLength = BufferLength + BUFFER_BYTES; \ char *NewBuffer = new char[NewBufferLength]; \ memcpy(NewBuffer, Buffer, BufferLength); \ delete[] Buffer; \ Buffer = NewBuffer; \ BufferLength = NewBufferLength; \ 
} \ Buffer[Pos++] = c; \ } // Get next sequence from file. char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps) { unsigned BufferLength = 0; unsigned Pos = 0; char *Buffer = 0; int c = fgetc(f); if (EOF == c) return 0; if ('>' != c) Die("Invalid file format, expected '>' to start FASTA label"); for (;;) { int c = fgetc(f); if (EOF == c) Die("End-of-file or input error in FASTA label"); // NL or CR terminates label if (NL == c || CR == c) break; // All other characters added to label ADD(c) } // Nul-terminate label ADD(0) *ptrLabel = Buffer; BufferLength = 0; Pos = 0; Buffer = 0; int PreviousChar = NL; for (;;) { int c = fgetc(f); if (EOF == c) { if (feof(f)) break; else if (ferror(f)) Die("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s", errno, strerror(errno)); else Die("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s", errno, strerror(errno)); } if ('>' == c) { if (NL == PreviousChar || CR == PreviousChar) { ungetc(c, f); break; } else Die("Unexpected '>' in FASTA sequence data"); } else if (isspace(c)) ; else if (IsGapChar(c)) { if (!DeleteGaps) { ADD(c); } } else if (isalpha(c)) { if (g_FASTA_Upper) c = toupper(c); ADD(c) } else if (g_FASTA_AllowDigits && isdigit(c)) { ADD(c); } else if (isprint(c)) { Warning("Invalid character '%c' in FASTA sequence data, ignored", c); continue; } else { Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c); continue; } PreviousChar = c; } if (0 == Pos) return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps); *ptrSeqLength = Pos; return Buffer; } muscle-5.1.0/src/filebuffer.h000066400000000000000000000051171424453062600160700ustar00rootroot00000000000000///////////////////////////////////////////////////////////////// // filebuffer.h // // Buffered file reading. 
#ifndef FILEBUFFER_H
#define FILEBUFFER_H

// The header names of these directives were missing. They are restored from
// the names this header actually uses: assert, ifstream and string.
// NOTE(review): <iostream> is a guess for the fourth directive -- confirm.
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>

// Kept for source compatibility: code that includes this header may rely on it.
using namespace std;

// Number of bytes requested from the file per refill.
const int BufferSize = 1000;

/////////////////////////////////////////////////////////////////
// FileBuffer
//
// Class for buffering file reading. Characters are served from
// a fixed-size in-memory block that is refilled from the file
// whenever it has been fully consumed.
/////////////////////////////////////////////////////////////////

class FileBuffer {
  ifstream file;
  char buffer[BufferSize];
  int currPos;      // index of the next unread character in buffer
  int size;         // number of valid characters in buffer
  bool isEOF;       // set once a refill returned no characters
  bool isValid;     // false if the open failed or after close()
  bool canUnget;    // true only directly after a successful Get()

 public:

  // Opens filename for reading; check fail() afterwards.
  FileBuffer (const char *filename) : file (filename), currPos (0), size (0),
    isEOF (false), isValid (!file.fail()), canUnget (false){}
  ~FileBuffer (){ close(); }

  // True if the file could not be opened or has been closed.
  bool fail () const { return !isValid; }

  // True if no further characters can be read.
  bool eof () const { return (!isValid || isEOF); }

  // Closes the file; the object reports fail() and eof() afterwards.
  void close(){ file.close(); isValid = false; }

  /////////////////////////////////////////////////////////////////
  // FileBuffer::Get()
  //
  // Retrieve a character from the file buffer.  Returns true if
  // and only if a character is read.
  /////////////////////////////////////////////////////////////////

  bool Get (char &ch){

    // check to make sure that there's more stuff in the file
    if (!isValid || isEOF) return false;

    // if the buffer is empty, it's time to reload it
    if (currPos == size){
      file.read (buffer, BufferSize);
      size = int(file.gcount());
      isEOF = (size == 0);
      currPos = 0;
      if (isEOF) return false;
    }

    // store the read character
    ch = buffer[currPos++];
    canUnget = true;
    return true;
  }

  /////////////////////////////////////////////////////////////////
  // FileBuffer::UnGet()
  //
  // Unretrieve the most recently read character from the file
  // buffer.  Note that this allows only a one-level undo, and
  // only directly after a successful Get() (checked with assert).
  /////////////////////////////////////////////////////////////////

  void UnGet (){
    assert (canUnget);
    assert (isValid);
    assert (currPos > 0);
    currPos--;
    assert (currPos < size);
    isEOF = false;
    canUnget = false;
  }

  /////////////////////////////////////////////////////////////////
  // FileBuffer::GetLine()
  //
  // Retrieve characters of text until a newline character is
  // encountered.  The newline is consumed but not stored in s.
  // Terminates properly on end-of-file condition.
  /////////////////////////////////////////////////////////////////

  void GetLine (string &s){
    char ch;
    s = "";
    while (Get (ch) && ch != '\n')
      s += ch;
  }

};

#endif
***/ void CalcFwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat) { #include "hmmscores.h" char x0 = X[0]; char y0 = Y[0]; float Ins_x0 = InsScore[x0]; float Ins_y0 = InsScore[y0]; float Emit_x0_y0 = MatchScore[x0][y0]; const uint LY1 = LY+1; const uint Base_0_0 = HMMSTATE_COUNT*(0*(LY1) + 0); const uint Base_1_1 = HMMSTATE_COUNT*(1*(LY1) + 1); const uint Base_1_0 = HMMSTATE_COUNT*(1*(LY1) + 0); const uint Base_0_1 = HMMSTATE_COUNT*(0*(LY1) + 1); const uint BaseInc_i = HMMSTATE_COUNT*LY1; const uint BaseInc_j = HMMSTATE_COUNT; Flat[Base_0_0 + HMMSTATE_M] = LOG_ZERO; // M(0,0) Flat[Base_0_0 + HMMSTATE_IX] = LOG_ZERO; // IX(0,0) Flat[Base_0_0 + HMMSTATE_JX] = LOG_ZERO; // JX(0,0) Flat[Base_0_0 + HMMSTATE_IY] = LOG_ZERO; // IY(0,0) Flat[Base_0_0 + HMMSTATE_JY] = LOG_ZERO; // JY(0,0) Flat[Base_1_1 + HMMSTATE_M] = tSM + Emit_x0_y0; Flat[Base_1_0 + HMMSTATE_IX] = tSI + Ins_x0; Flat[Base_1_0 + HMMSTATE_JX] = tSJ + Ins_x0; Flat[Base_0_1 + HMMSTATE_IY] = tSI + Ins_y0; Flat[Base_0_1 + HMMSTATE_JY] = tSJ + Ins_y0; uint Base = Base_1_0; for (uint i = 1; i <= LX; ++i) { Flat[Base + HMMSTATE_M] = LOG_ZERO; Flat[Base + HMMSTATE_IY] = LOG_ZERO; Flat[Base + HMMSTATE_JY] = LOG_ZERO; Base += BaseInc_i; } Base = Base_0_1; for (uint j = 1; j <= LY; ++j) { Flat[Base + HMMSTATE_M] = LOG_ZERO; Flat[Base + HMMSTATE_IX] = LOG_ZERO; Flat[Base + HMMSTATE_JX] = LOG_ZERO; Base += BaseInc_j; } Base = Base_1_0; uint NextBase = Base + BaseInc_i; for (uint i = 1; i < LX; ++i) { char x = X[i]; float Emit_x = InsScore[x]; Flat[NextBase + HMMSTATE_IX] = Flat[Base + HMMSTATE_IX] + tII + Emit_x; Flat[NextBase + HMMSTATE_JX] = Flat[Base + HMMSTATE_JX] + tJJ + Emit_x; Base = NextBase; NextBase += BaseInc_i; } Base = Base_0_1; NextBase = Base + BaseInc_j; for (uint j = 1; j < LY; ++j) { char y = Y[j]; float Emit_y = InsScore[y]; Flat[NextBase + HMMSTATE_IY] = Flat[Base + HMMSTATE_IY] + tII + Emit_y; Flat[NextBase + HMMSTATE_JY] = Flat[Base + HMMSTATE_JY] + tJJ + Emit_y; Base = NextBase; 
NextBase += BaseInc_j; } uint Base_i_j = Base_1_1; uint Base_i1_j = Base_0_1; uint Base_i_j1 = Base_1_0; uint Base_i1_j1 = Base_0_0; for (uint i = 1; i <= LX; ++i) { char x = X[i-1]; float Emit_x = InsScore[x]; for (uint j = 1; j <= LY; ++j) { char y = Y[j-1]; float Emit_y = InsScore[y]; float Emit_Pair = MatchScore[x][y]; if (i == 1 && j == 1) Flat[Base_1_1 + HMMSTATE_M] = tSM + Emit_x0_y0; else { float M_M = Flat[Base_i1_j1 + HMMSTATE_M] + tMM; float IX_M = Flat[Base_i1_j1 + HMMSTATE_IX] + tIM; float JX_M = Flat[Base_i1_j1 + HMMSTATE_JX] + tJM; float IY_M = Flat[Base_i1_j1 + HMMSTATE_IY] + tIM; float JY_M = Flat[Base_i1_j1 + HMMSTATE_JY] + tJM; float SumPrev = LOG_ADD(M_M, IX_M, JX_M, IY_M, JY_M); Flat[Base_i_j + HMMSTATE_M] = SumPrev + Emit_Pair; } float PrevM_i1_j = Flat[Base_i1_j + HMMSTATE_M]; float PrevM_i_j1 = Flat[Base_i_j1 + HMMSTATE_M]; float M_IX = PrevM_i1_j + tMI; float IX_IX = Flat[Base_i1_j + HMMSTATE_IX] + tII; Flat[Base_i_j + HMMSTATE_IX] = LOG_ADD(IX_IX, M_IX) + Emit_x; float M_JX = PrevM_i1_j + tMJ; float JX_JX = Flat[Base_i1_j + HMMSTATE_JX] + tJJ; Flat[Base_i_j + HMMSTATE_JX] = LOG_ADD(JX_JX, M_JX) + Emit_x; float M_IY = PrevM_i_j1 + tMI; float IY_IY = Flat[Base_i_j1 + HMMSTATE_IY] + tII; Flat[Base_i_j + HMMSTATE_IY] = LOG_ADD(IY_IY, M_IY) + Emit_y; float M_JY = PrevM_i_j1 + tMJ; float JY_JY = Flat[Base_i_j1 + HMMSTATE_JY] + tJJ; Flat[Base_i_j + HMMSTATE_JY] = LOG_ADD(JY_JY, M_JY) + Emit_y; Base_i_j += BaseInc_j; Base_i1_j += BaseInc_j; Base_i_j1 += BaseInc_j; Base_i1_j1 += BaseInc_j; } Base_i_j += BaseInc_j; Base_i1_j += BaseInc_j; Base_i_j1 += BaseInc_j; Base_i1_j1 += BaseInc_j; } } muscle-5.1.0/src/getconsseq.cpp000066400000000000000000000030521424453062600164610ustar00rootroot00000000000000#include "muscle.h" static char GetConsChar(const MultiSequence &MSA, uint ColIndex) { asserta(g_AlphaSize == 4 || g_AlphaSize == 20); vector Counts(g_AlphaSize+1); const uint ColCount = MSA.GetColCount(); const uint SeqCount = MSA.GetSeqCount(); for 
// Enumerate every unordered pair (i, j) with 0 <= i < j < Count.
// Indexes1[k] and Indexes2[k] receive the two members of the k-th pair;
// pairs are produced in ascending (i, j) order. Both vectors are cleared
// first.
void GetAllPairs(uint Count, std::vector<uint> &Indexes1, std::vector<uint> &Indexes2)
	{
	Indexes1.clear();
	Indexes2.clear();

	// Count*(Count-1)/2 pairs; computed in size_t so that the product cannot
	// wrap in 32 bits.
	const size_t PairCount = (Count < 2) ? 0 : (size_t(Count)*(Count - 1))/2;
	Indexes1.reserve(PairCount);
	Indexes2.reserve(PairCount);

	for (uint i = 0; i < Count; ++i)
		{
		for (uint j = i + 1; j < Count; ++j)
			{
			Indexes1.push_back(i);
			Indexes2.push_back(j);
			}
		}
	}

// Enumerate every pair (i, j) with 0 <= i < Count1 and 0 <= j < Count2,
// i.e. the full cross product of two index ranges, in row-major order.
// Both vectors are cleared first.
void GetAllPairs(uint Count1, uint Count2,
  std::vector<uint> &Indexes1, std::vector<uint> &Indexes2)
	{
	Indexes1.clear();
	Indexes2.clear();

	const size_t PairCount = size_t(Count1)*size_t(Count2);
	Indexes1.reserve(PairCount);
	Indexes2.reserve(PairCount);

	for (uint i = 0; i < Count1; ++i)
		{
		for (uint j = 0; j < Count2; ++j)
			{
			Indexes1.push_back(i);
			Indexes2.push_back(j);
			}
		}
	}
(TargetPairCount == UINT_MAX || AllPairCount < TargetPairCount*3/2) { GetAllPairs(Count1, Count2, Indexes1, Indexes2); return; } set > PairSet; const uint MaxCounter = TargetPairCount*10; uint Counter = 0; while (Counter++ < MaxCounter && (uint) SIZE(PairSet) < TargetPairCount) { uint i = randu32()%Count1; uint j = randu32()%Count2; if (i == j) continue; pair Pair(i, j); PairSet.insert(Pair); } uint PairCount = SIZE(PairSet); asserta(PairCount > TargetPairCount/2); for (set >::const_iterator p = PairSet.begin(); p != PairSet.end(); ++p) { uint Index1 = p->first; uint Index2 = p->second; Indexes1.push_back(Index1); Indexes2.push_back(Index2); } } muscle-5.1.0/src/getpostpairsalignedflat.cpp000066400000000000000000000045611424453062600212330ustar00rootroot00000000000000#include "muscle.h" #include "locallock.h" float GetPostPairsAlignedFlat(const string &aProgressStr, const MultiSequence &MSA1, const MultiSequence &MSA2, const vector &SeqIndexes1, const vector &SeqIndexes2, vector &SparsePosts) { string ProgressStr = aProgressStr; if (SIZE(ProgressStr) > 20) ProgressStr = ProgressStr.substr(0, 20); const uint SeqCount1 = MSA1.GetSeqCount(); const uint SeqCount2 = MSA2.GetSeqCount(); const uint PairCount = SIZE(SeqIndexes1); asserta(SIZE(SeqIndexes2) == PairCount); asserta(SparsePosts.empty()); // Allocate here to avoid race condition with push_back() in loop SparsePosts.resize(PairCount); int PairCounter = 0; uint ThreadCount = GetRequestedThreadCount(); float SumEA = 0; #pragma omp parallel for num_threads(ThreadCount) for (int PairIndex = 0; PairIndex < (int) PairCount; ++PairIndex) { uint Min = min(SeqCount1, SeqCount2); uint Max = max(SeqCount1, SeqCount2); Lock(); ProgressStep(PairCounter++, PairCount, "%s [%u x %u, %u pairs]", ProgressStr.c_str(), Min, Max, PairCount); Unlock(); uint SeqIndex1 = SeqIndexes1[PairIndex]; uint SeqIndex2 = SeqIndexes2[PairIndex]; asserta(SeqIndex1 < SeqCount1); asserta(SeqIndex2 < SeqCount2); const Sequence *gapped_seq1 = 
MSA1.GetSequence(SeqIndex1); const Sequence *gapped_seq2 = MSA2.GetSequence(SeqIndex2); Sequence *seq1 = gapped_seq1->DeleteGaps(); Sequence *seq2 = gapped_seq2->DeleteGaps(); const byte *ByteSeq1 = seq1->GetBytePtr(); const byte *ByteSeq2 = seq2->GetBytePtr(); const uint L1 = seq1->GetLength(); const uint L2 = seq2->GetLength(); float *Fwd = AllocFB(L1, L2); float *Bwd = AllocFB(L1, L2); float *Post = AllocPost(L1, L2); CalcFwdFlat(ByteSeq1, L1, ByteSeq2, L2, Fwd); CalcBwdFlat(ByteSeq1, L1, ByteSeq2, L2, Bwd); DeleteSequence(seq1); DeleteSequence(seq2); CalcPostFlat(Fwd, Bwd, L1, L2, Post); delete Fwd; delete Bwd; float *DPRows = AllocDPRows(L1, L2); char *TB = AllocTB(L1, L2); string Path; float Score = CalcAlnFlat(Post, L1, L2, DPRows, TB, Path); delete DPRows; delete TB; MySparseMx *SparsePost = new MySparseMx; asserta(SparsePost); SparsePost->FromPost(Post, L1, L2); SparsePosts[PairIndex] = SparsePost; delete Post; float EA = Score/min(L1, L2); Lock(); SumEA += EA; Unlock(); } float AvgEA = SumEA/PairCount; return AvgEA; } muscle-5.1.0/src/gitver.bash000066400000000000000000000005401424453062600157400ustar00rootroot00000000000000#!/bin/bash if [ ! -d ../.git ] ; then if [ ! 
-f gitver.txt ] ; then echo "0" > gitver.txt fi echo "Repo not found, git hash set to zero" exit 0 fi PATH=$PATH:/usr/bin git describe --abbrev=6 --dirty --long --always \ > gitver.tmp sed -i '-es/"//g' gitver.tmp echo \"`cat gitver.tmp`\" > gitver.txt rm -f gitver.tmp cat gitver.txt muscle-5.1.0/src/gitver.bat000066400000000000000000000003641424453062600155750ustar00rootroot00000000000000@echo off if exist gitver.txt ( echo gitver.txt found ) else ( echo "-" > gitver.txt ) if exist c:\cygwin64\bin\bash.exe ( echo bash found c:\cygwin64\bin\bash -c ./gitver.bash ) else ( echo bash not found echo 000 > gitver.txt exit ) muscle-5.1.0/src/globalinputms.cpp000066400000000000000000000035641424453062600171760ustar00rootroot00000000000000#include "muscle.h" static MultiSequence *g_GlobalMS; static uint g_GlobalMSSeqCount = 0; static double g_GlobalMSMeanSeqLength = 0; static uint g_GlobalMSMaxSeqLength = 0; void ClearGlobalInputMS() { if (g_GlobalMS == 0) return; delete g_GlobalMS; g_GlobalMS = 0; } MultiSequence &LoadGlobalInputMS(const string &FileName) { asserta(g_GlobalMS == 0); g_GlobalMS = new MultiSequence; asserta(g_GlobalMS != 0); g_GlobalMS->FromFASTA(FileName, true); g_GlobalMSSeqCount = g_GlobalMS->GetSeqCount(); g_GlobalMSMeanSeqLength = 0; g_GlobalMSMaxSeqLength = 0; double SumSeqLength = 0; for (uint GSI = 0; GSI < g_GlobalMSSeqCount; ++GSI) { const Sequence *Seq = g_GlobalMS->GetSequence(GSI); uint L = Seq->GetLength(); g_GlobalMSMaxSeqLength = max(L, g_GlobalMSMaxSeqLength); SumSeqLength += L; Sequence *HackSeq = (Sequence *) Seq; HackSeq->m_GSI = GSI; } if (g_GlobalMSSeqCount > 0) g_GlobalMSMeanSeqLength = SumSeqLength/g_GlobalMSSeqCount; return *g_GlobalMS; } MultiSequence &GetGlobalInputMS() { asserta(g_GlobalMS != 0); return *g_GlobalMS; } uint GetGlobalMSSeqCount() { return g_GlobalMSSeqCount; } double GetGlobalMSMeanSeqLength() { return g_GlobalMSMeanSeqLength; } uint GetGSICount() { return GetGlobalMSSeqCount(); } const Sequence 
&GetGlobalInputSeq(uint GSI) { asserta(GSI < g_GlobalMSSeqCount); asserta(g_GlobalMS != 0); const Sequence *Seq = g_GlobalMS->GetSequence(GSI); asserta(Seq != 0); return *Seq; } const string &GetGlobalInputSeqLabel(uint GSI) { const Sequence &Seq = GetGlobalInputSeq(GSI); const string &Label = Seq.GetLabel(); return Label; } void ShowGlobalInputSeqStats() { ProgressLog("Input: %u seqs, length avg %.0f max %u\n\n", g_GlobalMSSeqCount, g_GlobalMSMeanSeqLength, g_GlobalMSMaxSeqLength); if (g_GlobalMSMaxSeqLength > 5000) Warning("Sequence length >5k may require excessive memory"); } muscle-5.1.0/src/gobuff.h000066400000000000000000000015351424453062600152270ustar00rootroot00000000000000#ifndef gobuff_h #define gobuff_h #include "myutils.h" template class GoBuff { public: unsigned MaxSize; unsigned Size; T *Data; public: GoBuff() { MaxSize = 0; Size = 0; Data = 0; } ~GoBuff() { Free(); } void Free() { myfree(Data); Size = 0; Data = 0; } void Alloc(unsigned n) { if (n <= MaxSize) return; unsigned NewMaxSize = n + SizeInc; T *NewBuffer = myalloc(T, NewMaxSize); if (Size > 0) { if (CopyOnGrow) memcpy(NewBuffer, Data, Size*sizeof(T)); myfree(Data); } if (ZeroOnGrow) memset(NewBuffer, 0, NewMaxSize*sizeof(T)); Data = NewBuffer; MaxSize = NewMaxSize; } unsigned GetMemUseBytes() const { return (MaxSize*sizeof(T)); } }; const unsigned GROW64K = 0x10000; #endif // gobuff_h muscle-5.1.0/src/guidetreejoinorder.cpp000066400000000000000000000143461424453062600202070ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "textfile.h" #include "tree.h" #include "pprog.h" #include void ValidateJoinOrder(const vector &Indexes1, const vector &Indexes2) { const uint JoinCount = SIZE(Indexes1); asserta(SIZE(Indexes2) == JoinCount); const uint LeafCount = JoinCount + 1; const uint NodeCount = 2*LeafCount - 1; set Pending; for (uint LeafIndex = 0; LeafIndex < LeafCount; ++LeafIndex) Pending.insert(LeafIndex); vector Used(NodeCount, false); for (uint JoinIndex = 0; 
JoinIndex < JoinCount; ++JoinIndex) { uint Index1 = Indexes1[JoinIndex]; uint Index2 = Indexes2[JoinIndex]; asserta(Index1 != Index2); asserta(Index1 < NodeCount); asserta(Index2 < NodeCount); asserta(Used[Index1] == false); asserta(Used[Index2] == false); asserta(Pending.find(Index1) != Pending.end()); asserta(Pending.find(Index2) != Pending.end()); uint JoinNodeIndex = LeafCount + JoinIndex; Used[Index1] = true; Used[Index2] = true; Pending.erase(Index1); Pending.erase(Index2); Pending.insert(JoinNodeIndex); } asserta(SIZE(Pending) == 1); uint UsedCount = 0; uint NotUsedCount = 0; for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) { if (Used[NodeIndex]) ++UsedCount; else ++NotUsedCount; } asserta(NotUsedCount == 1); } static const char *GetLabel(const map &LabelToIndex, uint Index) { for (map::const_iterator p = LabelToIndex.begin(); p != LabelToIndex.end(); ++p) { if (p->second == Index) return p->first.c_str(); } asserta(false); return 0; } void LogGuideTreeJoinOrder(const Tree &GuideTree, const map &LabelToIndex, const vector &Indexes1, const vector &Indexes2) { asserta(GuideTree.IsRooted()); const uint NodeCount = GuideTree.GetNodeCount(); const uint LeafCount = GuideTree.GetLeafCount(); const uint JoinCount = LeafCount - 1; asserta(SIZE(Indexes1) == JoinCount); asserta(SIZE(Indexes2) == JoinCount); Log(" Join Index1 Index2\n"); for (uint JoinIndex = 0; JoinIndex < JoinCount; ++JoinIndex) { uint Index1 = Indexes1[JoinIndex]; uint Index2 = Indexes2[JoinIndex]; Log("%6u", JoinIndex); Log(" %6u", Index1); Log(" %6u", Index2); Log(" "); if (Index1 < LeafCount) Log(" '%s'", GetLabel(LabelToIndex, Index1)); else Log(" Join%u", Index1 - LeafCount); Log(" +"); if (Index2 < LeafCount) Log(" '%s'", GetLabel(LabelToIndex, Index2)); else Log(" Join%u", Index2 - LeafCount); Log("\n"); } } void GetGuideTreeJoinOrder(const Tree &GuideTree, const map &LabelToIndex, vector &Indexes1, vector &Indexes2) { asserta(GuideTree.IsRooted()); Indexes1.clear(); 
Indexes2.clear(); vector Pending; const uint NodeCount = GuideTree.GetNodeCount(); const uint LeafCount = GuideTree.GetLeafCount(); vector IndexUsed(LeafCount); const uint JoinCount = LeafCount - 1; uint JoinIndex = LeafCount; vector Stack; for (uint Node = GuideTree.FirstDepthFirstNode(); Node != UINT_MAX; Node = GuideTree.NextDepthFirstNode(Node)) { if (GuideTree.IsLeaf(Node)) { const string Label = GuideTree.GetLeafName(Node); map::const_iterator p = LabelToIndex.find(Label); if (p == LabelToIndex.end()) Die("Label not found >%s", Label.c_str()); uint Index = p->second; asserta(Index < LeafCount); asserta(!IndexUsed[Index]); Stack.push_back(Index); IndexUsed[Index] = true; } else { asserta(SIZE(Stack) >= 2); uint Left = Stack.back(); Stack.pop_back(); uint Right = Stack.back(); Stack.pop_back(); Indexes1.push_back(Right); Indexes2.push_back(Left); Stack.push_back(JoinIndex++); } } } void MakeGuideTreeFromJoinOrder(const vector &Indexes1, const vector &Indexes2, const map &LabelToIndex, Tree &GuideTree) { const uint JoinCount = SIZE(Indexes1); asserta(SIZE(Indexes2) == JoinCount); const uint LeafCount = JoinCount + 1; const uint NodeCount = LeafCount + JoinCount; char **LeafLabels = myalloc(char *, LeafCount); for (uint LeafIndex = 0; LeafIndex < LeafCount; ++LeafIndex) LeafLabels[LeafIndex] = (char *) GetLabel(LabelToIndex, LeafIndex); vector Lefts; vector Rights; for (uint JoinIndex = 0; JoinIndex < JoinCount; ++JoinIndex) { uint Index1 = Indexes1[JoinIndex]; uint Index2 = Indexes2[JoinIndex]; Lefts.push_back(Index1); Rights.push_back(Index2); } vector LeafIds(LeafCount, 1); const vector Lengths(NodeCount, 1); GuideTree.Create(LeafCount, JoinCount-1, Lefts.data(), Rights.data(), Lengths.data(), Lengths.data(), LeafIds.data(), LeafLabels); } void cmd_guide_tree_join_order() { const string &TreeFileName = opt(guide_tree_join_order); const string &OutputFileName = opt(output); Tree GuideTree; GuideTree.FromFile(TreeFileName); map LabelToIndex; const uint NodeCount 
= GuideTree.GetNodeCount(); const uint LeafCount = GuideTree.GetLeafCount(); const uint JoinCount = LeafCount - 1; uint LeafIndex = 0; for (uint Node = 0; Node < NodeCount; ++Node) { if (GuideTree.IsLeaf(Node)) { const string Label = GuideTree.GetLeafName(Node); assert(LabelToIndex.find(Label) == LabelToIndex.end()); LabelToIndex[Label] = LeafIndex++; } } vector Indexes1; vector Indexes2; GetGuideTreeJoinOrder(GuideTree, LabelToIndex, Indexes1, Indexes2); LogGuideTreeJoinOrder(GuideTree, LabelToIndex, Indexes1, Indexes2); ValidateJoinOrder(Indexes1, Indexes2); if (OutputFileName.empty()) return; FILE *f = CreateStdioFile(OutputFileName); asserta(GuideTree.IsRooted()); asserta(SIZE(Indexes1) == JoinCount); asserta(SIZE(Indexes2) == JoinCount); for (uint JoinIndex = 0; JoinIndex < JoinCount; ++JoinIndex) { uint Index1 = Indexes1[JoinIndex]; uint Index2 = Indexes2[JoinIndex]; fprintf(f, "%u", JoinIndex); fprintf(f, "\t%u", Index1); fprintf(f, "\t%u", Index2); if (Index1 < LeafCount) fprintf(f, "\tleaf\t%s", GetLabel(LabelToIndex, Index1)); else fprintf(f, "\tjoin\t%u", Index1 - LeafCount); if (Index2 < LeafCount) fprintf(f, "\tleaf\t%s", GetLabel(LabelToIndex, Index2)); else fprintf(f, "\tjoin\t%u", Index2 - LeafCount); fprintf(f, "\n"); } CloseStdioFile(f); } muscle-5.1.0/src/heatmapcolors.cpp000066400000000000000000000004731424453062600171530ustar00rootroot00000000000000 const char *g_HeatmapColors_HTML[10] = { "ff6464", "ff7878", "ff9696", "ffbebe", "ffe6e6", "ffffff", "e8f6f9", "d1f2f9", "a4f3fc", "98e8f9" }; const char *g_HeatmapColors_JalView[10] = { "A00000", "902020", "803030", "703030", "603030", "404040", "407040", "308030", "009000", "00A000" }; muscle-5.1.0/src/heatmapcolors.h000066400000000000000000000001531424453062600166130ustar00rootroot00000000000000#pragma once extern const char *g_HeatmapColors_HTML[10]; extern const char *g_HeatmapColors_JalView[10]; 
muscle-5.1.0/src/help.cpp000066400000000000000000000001441424453062600152350ustar00rootroot00000000000000#include "muscle.h" void Help() { PrintBanner(stdout); fprintf(stdout, #include "help.h" ); } muscle-5.1.0/src/help.h000066400000000000000000000055501424453062600147100ustar00rootroot00000000000000"Align FASTA input, write aligned FASTA (AFA) output:\n" " muscle -align input.fa -output aln.afa\n" "\n" "Align large input using Super5 algorithm if -align is too expensive,\n" "typically needed with more than a few hundred sequences:\n" " muscle -super5 input.fa -output aln.afa\n" "\n" "Single replicate alignment:\n" " muscle -align input.fa -perm PERM -perturb SEED -output aln.afa\n" " muscle -super5 input.fa -perm PERM -perturb SEED -output aln.afa\n" " PERM is guide tree permutation none, abc, acb, bca (default none).\n" " SEED is perturbation seed 0, 1, 2... (default 0 = don't perturb).\n" "\n" "Ensemble of replicate alignments, output in Ensemble FASTA (EFA) format,\n" "EFA has one aligned FASTA for each replicate with header line \" InitProbs; vector InitScores; for (uint i = 0; i < HMMSTATE_COUNT; ++i) { float Score = PairHMM::m_StartScore[i]; float Prob = exp(Score); InitProbs.push_back(Prob); InitScores.push_back(Score); } const float InitProb_M = InitProbs[HMMSTATE_M]; const float InitProb_IX = InitProbs[HMMSTATE_IX]; const float InitProb_IY = InitProbs[HMMSTATE_IY]; const float InitProb_JX = InitProbs[HMMSTATE_JX]; const float InitProb_JY = InitProbs[HMMSTATE_JY]; const float InitSum = InitProb_M + InitProb_IX + InitProb_IY + InitProb_JX + InitProb_JY; const float InitProb_IS = InitProb_IX; const float InitProb_IL = InitProb_JX; asserta(feq(InitProb_IX, InitProb_IY)); asserta(feq(InitProb_JX, InitProb_JY)); asserta(feq(InitSum, 1.0)); fprintf(fOut, "\n"); fprintf(fOut, "// Probs\n"); fprintf(fOut, "const float InitProb_IM = %.5g;\n", InitProb_M); fprintf(fOut, "const float InitProb_IS = %.5g;\n", InitProb_IX); fprintf(fOut, "const float InitProb_IL = 
%.5g;\n", InitProb_JX); vector > TransProbs(HMMSTATE_COUNT); vector > TransScores(HMMSTATE_COUNT); for (uint i = 0; i < HMMSTATE_COUNT; ++i) { TransProbs[i].resize(HMMSTATE_COUNT); TransScores[i].resize(HMMSTATE_COUNT); float Sum = 0; for (uint j = 0; j < HMMSTATE_COUNT; ++j) { float Score = PairHMM::m_TransScore[i][j]; float Prob = exp(Score); Sum += Prob; TransProbs[i][j] = Prob; TransScores[i][j] = Score; } asserta(feq(Sum, 1.0)); } // No transitions between different insert states asserta(TransProbs[HMMSTATE_IX][HMMSTATE_IY] == 0); asserta(TransProbs[HMMSTATE_IX][HMMSTATE_JY] == 0); asserta(TransProbs[HMMSTATE_JX][HMMSTATE_IY] == 0); asserta(TransProbs[HMMSTATE_JX][HMMSTATE_JY] == 0); asserta(TransProbs[HMMSTATE_IY][HMMSTATE_IX] == 0); asserta(TransProbs[HMMSTATE_IY][HMMSTATE_JX] == 0); asserta(TransProbs[HMMSTATE_JY][HMMSTATE_IX] == 0); asserta(TransProbs[HMMSTATE_JY][HMMSTATE_JX] == 0); asserta(TransProbs[HMMSTATE_M][HMMSTATE_IX] == TransProbs[HMMSTATE_M][HMMSTATE_IY]); asserta(TransProbs[HMMSTATE_M][HMMSTATE_JX] == TransProbs[HMMSTATE_M][HMMSTATE_JY]); asserta(TransProbs[HMMSTATE_IX][HMMSTATE_M] == TransProbs[HMMSTATE_IY][HMMSTATE_M]); asserta(TransProbs[HMMSTATE_JX][HMMSTATE_M] == TransProbs[HMMSTATE_JY][HMMSTATE_M]); asserta(TransProbs[HMMSTATE_IX][HMMSTATE_M] == TransProbs[HMMSTATE_IY][HMMSTATE_M]); const float TransProb_M_M = TransProbs[HMMSTATE_M][HMMSTATE_M]; const float TransScore_M_M = TransScores[HMMSTATE_M][HMMSTATE_M]; const float TransProb_M_IS = TransProbs[HMMSTATE_M][HMMSTATE_IX]; const float TransScore_M_IS = TransScores[HMMSTATE_M][HMMSTATE_IX]; const float TransProb_M_IL = TransProbs[HMMSTATE_M][HMMSTATE_JX]; const float TransScore_M_IL = TransScores[HMMSTATE_M][HMMSTATE_JX]; const float TransProb_IS_IS = TransProbs[HMMSTATE_IX][HMMSTATE_IX]; const float TransScore_IS_IS = TransScores[HMMSTATE_IX][HMMSTATE_IX]; const float TransProb_IL_IL = TransProbs[HMMSTATE_JX][HMMSTATE_JX]; const float TransScore_IL_IL = 
TransScores[HMMSTATE_JX][HMMSTATE_JX]; const float TransProb_IS_M = TransProbs[HMMSTATE_IX][HMMSTATE_M]; const float TransScore_IS_M = TransScores[HMMSTATE_IX][HMMSTATE_M]; const float TransProb_IL_M = TransProbs[HMMSTATE_JX][HMMSTATE_M]; const float TransScore_IL_M = TransScores[HMMSTATE_JX][HMMSTATE_M]; asserta(feq(InitProb_M + 2*InitProb_IS + 2*InitProb_IL, 1.0)); asserta(feq(TransProb_IS_IS + TransProb_IS_M, 1.0)); asserta(feq(TransProb_IL_IL + TransProb_IL_M, 1.0)); asserta(feq(TransProb_M_M + 2*TransProb_M_IS + 2*TransProb_M_IL, 1.0)); fprintf(fOut, "\n"); fprintf(fOut, "const float TransProb_M_M = %.5g;\n", TransProb_M_M); fprintf(fOut, "const float TransProb_M_IS = %.5g;\n", TransProb_M_IS); fprintf(fOut, "const float TransProb_M_IL = %.5g;\n", TransProb_M_IL); fprintf(fOut, "const float TransProb_IS_IS = %.5g;\n", TransProb_IS_IS); fprintf(fOut, "const float TransProb_IL_IL = %.5g;\n", TransProb_IL_IL); fprintf(fOut, "const float TransProb_IS_M = %.5g;\n", TransProb_IS_M); fprintf(fOut, "const float TransProb_IL_M = %.5g;\n", TransProb_IL_M); const string A = "ACDEFGHIKLMNPQRSTVWY"; asserta(SIZE(A) == 20); vector InsProbs(20); vector InsScores(20); float Sum = 0; for (uint i = 0; i < 20; ++i) { char a = A[i]; uint Letter = uint(a); float Score = PairHMM::m_InsScore[Letter]; float Prob = exp(Score); InsProbs[i] = Prob; InsScores[i] = Score; Sum += Prob; } asserta(feq(Sum, 1.0)); vector > EmitProbs(20); vector > EmitScores(20); Sum = 0; for (uint i = 0; i < 20; ++i) { EmitProbs[i].resize(20); EmitScores[i].resize(20); char a = A[i]; uint Letter_a = uint(a); for (uint j = 0; j < 20; ++j) { char b = A[j]; uint Letter_b = uint(b); float Score = PairHMM::m_MatchScore[Letter_a][Letter_b]; float Prob = exp(Score); EmitProbs[i][j] = Prob; EmitScores[i][j] = Score; Sum += Prob; } } asserta(feq(Sum, 1.0)); fprintf(fOut, "\n"); fprintf(fOut, "const float InsProbs[20] =\n"); fprintf(fOut, " {\n"); for (uint i = 0; i < 20; ++i) { char a = A[i]; float Prob = InsProbs[i]; 
fprintf(fOut, " %.5g, // %c\n", Prob, a); } fprintf(fOut, " };\n"); fprintf(fOut, "\n"); fprintf(fOut, "const float EmitProbs[20][20] =\n"); fprintf(fOut, " {\n"); fprintf(fOut, "// "); for (uint i = 0; i < 20; ++i) { char a = A[i]; fprintf(fOut, " %c", a); } fprintf(fOut, "\n"); for (uint i = 0; i < 20; ++i) { char a = A[i]; fprintf(fOut, "/* %c */ { ", a); for (uint j = 0; j < 20; ++j) { float Prob = EmitProbs[i][j]; fprintf(fOut, " %.5g", Prob); } fprintf(fOut, " } // %c\n", a); } fprintf(fOut, " };\n"); /////////////////////////////////////////////////////////////////// // Scores /////////////////////////////////////////////////////////////////// fprintf(fOut, "\n"); fprintf(fOut, "// Scores\n"); fprintf(fOut, "const float InitScore_IM = %.5g;\n", InitScores[HMMSTATE_M]); fprintf(fOut, "const float InitScore_IS = %.5g;\n", InitScores[HMMSTATE_IX]); fprintf(fOut, "const float InitScore_IL = %.5g;\n", InitScores[HMMSTATE_JX]); fprintf(fOut, "\n"); fprintf(fOut, "const float TransScore_M_M = %.5g;\n", TransScore_M_M); fprintf(fOut, "const float TransScore_M_IS = %.5g;\n", TransScore_M_IS); fprintf(fOut, "const float TransScore_M_IL = %.5g;\n", TransScore_M_IL); fprintf(fOut, "const float TransScore_IS_IS = %.5g;\n", TransScore_IS_IS); fprintf(fOut, "const float TransScore_IL_IL = %.5g;\n", TransScore_IL_IL); fprintf(fOut, "const float TransScore_IS_M = %.5g;\n", TransScore_IS_M); fprintf(fOut, "const float TransScore_IL_M = %.5g;\n", TransScore_IL_M); fprintf(fOut, "\n"); fprintf(fOut, "const float InsScores[20] =\n"); fprintf(fOut, " {\n"); for (uint i = 0; i < 20; ++i) { char a = A[i]; float Score = InsScores[i]; fprintf(fOut, " %.5g, // %c\n", Score, a); } fprintf(fOut, " };\n"); fprintf(fOut, "\n"); fprintf(fOut, "const float EmitScores[20][20] =\n"); fprintf(fOut, " {\n"); fprintf(fOut, "// "); for (uint i = 0; i < 20; ++i) { char a = A[i]; fprintf(fOut, " %c", a); } fprintf(fOut, "\n"); for (uint i = 0; i < 20; ++i) { char a = A[i]; fprintf(fOut, "/* %c */ { 
", a); for (uint j = 0; j < 20; ++j) { float Prob = EmitScores[i][j]; fprintf(fOut, " %.5g", Prob); } fprintf(fOut, " } // %c\n", a); } fprintf(fOut, " };\n"); } void cmd_hmmdump() { string OutDir = opt(hmmdump); Dirize(OutDir); SetAlpha(ALPHA_Amino); InitProbcons(); PairHMM::WriteParamsReport(OutDir + "params_report.txt"); bool Nucleo = opt(nt); HMMParams HP; HP.FromDefaults(Nucleo); HP.ToFile(OutDir + "hmm.tsv"); HP.ToPairHMM(); PairHMM::WriteParamsReport(OutDir + "params_report2.txt"); HP.ToFile(OutDir + "hmm2.tsv"); HP.FromFile(OutDir + "hmm2.tsv"); HP.ToFile(OutDir + "hmm3.tsv"); HMMParams SA; HP.ToSingleAffineProbs(SA); SA.ToFile(OutDir + "sa.hmm"); } muscle-5.1.0/src/hmmparams.cpp000066400000000000000000000260611424453062600163000ustar00rootroot00000000000000#include "muscle.h" #include "hmmparams.h" const char *HMMTRANSToStr(HMMTRANS t) { switch (t) { #define T(x) case HMMTRANS_##x: return #x; #include "hmmtrans.h" default: asserta(false); } return "?"; } void HMMParams::GetProbs(HMMParams &Probs) const { if (m_Logs) ProbsToScores(*this, Probs); else Probs = *this; Probs.AssertProbsValid(); } void HMMParams::FromParams(const HMMParams &Params, bool AsProbs) { if (AsProbs) { HMMParams Probs; Params.GetProbs(Probs); *this = Probs; } else { HMMParams Scores; Params.GetProbs(Scores); *this = Scores; } } void HMMParams::GetScores(HMMParams &Scores) const { if (m_Logs) Scores = *this; else { AssertProbsValid(); ProbsToScores(*this, Scores); } } void HMMParams::ToSingleAffineProbs(HMMParams &Params) { GetProbs(Params); vector &T = Params.m_Trans; float SI = (T[HMMTRANS_START_IS] + T[HMMTRANS_START_IL])/2; float MI = (T[HMMTRANS_M_IS] + T[HMMTRANS_M_IL])/2; float IM = (T[HMMTRANS_IS_M] + T[HMMTRANS_IL_M])/2; float II = (T[HMMTRANS_IS_IS] + T[HMMTRANS_IL_IL])/2; T[HMMTRANS_START_IS] = SI; T[HMMTRANS_START_IL] = SI; T[HMMTRANS_M_IS] = MI; T[HMMTRANS_M_IL] = MI; T[HMMTRANS_IS_M] = IM; T[HMMTRANS_IL_M] = IM; T[HMMTRANS_IS_IS] = II; T[HMMTRANS_IL_IL] = II; 
Params.AssertProbsValid(); } void HMMParams::ScoresToProbs(const HMMParams &Scores, HMMParams &Probs) { Probs.m_Alpha = Scores.m_Alpha; const unsigned AlphaSize = Scores.GetAlphaSize(); for (uint i = 0; i < HMMTRANS_N; ++i) Probs.m_Trans[i] = exp(Scores.m_Trans[i]); for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j < AlphaSize; ++j) Probs.m_Emits[i][j] = exp(Scores.m_Emits[i][j]); Probs.m_Logs = false; Probs.AssertProbsValid(); } void HMMParams::ProbsToScores(const HMMParams &Probs, HMMParams &Scores) { Probs.AssertProbsValid(); Scores.m_Alpha = Probs.m_Alpha; const unsigned AlphaSize = Probs.GetAlphaSize(); Scores.m_Trans.clear(); Scores.m_Trans.resize(HMMTRANS_N); Scores.m_Emits.clear(); Scores.m_Emits.clear(); Scores.m_Emits.resize(AlphaSize); for (uint i = 0; i < AlphaSize; ++i) Scores.m_Emits[i].resize(AlphaSize, FLT_MAX); for (uint i = 0; i < HMMTRANS_N; ++i) Scores.m_Trans[i] = log(Probs.m_Trans[i]); for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j < AlphaSize; ++j) Scores.m_Emits[i][j] = log(Probs.m_Emits[i][j]); Scores.m_Logs = true; } void HMMParams::AssertProbsValid() const { asserta(!m_Logs); asserta(SIZE(m_Trans) == HMMTRANS_N); const uint AlphaSize = GetAlphaSize(); asserta(SIZE(m_Emits) == AlphaSize); for (uint i = 0; i < AlphaSize; ++i) asserta(SIZE(m_Emits[i]) == AlphaSize); float SumSTART = m_Trans[HMMTRANS_START_M] + 2*m_Trans[HMMTRANS_START_IS] + 2*m_Trans[HMMTRANS_START_IL]; asserta(feq(SumSTART, 1.0)); float SumIS = m_Trans[HMMTRANS_IS_M] + m_Trans[HMMTRANS_IS_IS]; asserta(feq(SumIS, 1.0)); float SumIL = m_Trans[HMMTRANS_IL_M] + m_Trans[HMMTRANS_IL_IL]; asserta(feq(SumIL, 1.0)); float SumM = m_Trans[HMMTRANS_M_M] + 2*m_Trans[HMMTRANS_M_IS] + 2*m_Trans[HMMTRANS_M_IL]; asserta(feq(SumM, 1.0)); float SumEmit = 0; for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j < AlphaSize; ++j) SumEmit += m_Emits[i][j]; asserta(feq(SumEmit, 1.0)); for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j < i; ++j) asserta(feq(m_Emits[i][j], 
m_Emits[j][i])); } void HMMParams::ToFile(const string &FileName) const { if (FileName.empty()) return; const uint AlphaSize = GetAlphaSize(); AssertProbsValid(); FILE *f = CreateStdioFile(FileName); if (m_Alpha == AMINO_ALPHA) fprintf(f, "HMM aa\n"); else if (m_Alpha == NT_ALPHA) fprintf(f, "HMM nt\n"); else Die("HMMParams::ToFile alpha='%'s", m_Alpha.c_str()); #define T(x) fprintf(f, "T.%s %.5g\n", #x, m_Trans[HMMTRANS_##x]); #include "hmmtrans.h" for (uint i = 0; i < AlphaSize; ++i) { char a = m_Alpha[i]; for (uint j = 0; j <= i; ++j) { char b = m_Alpha[j]; float P = m_Emits[i][j]; fprintf(f, "E.%c%c %.5g\n", a, b, P); } } CloseStdioFile(f); } float HMMParams::GetNextProb(const string &Name) { string Line; vector Fields; bool Ok = GetNextLine(Line); if (!Ok) Die("GetNextProb(%s) end-of-data", Name.c_str()); Split(Line, Fields, '\t'); if (SIZE(Fields) != 2) Die("GetNextProb(%s) expected 2 fields got '%s'", Name.c_str(), Line.c_str()); if (Fields[0] != Name) Die("ReadGetNextProbTrans(%s) got '%s'", Name.c_str(), Line.c_str()); float P = (float) StrToFloat(Fields[1]); return P; } bool HMMParams::GetNextLine(string &Line) { Line.clear(); if (m_LineNr >= SIZE(m_Lines)) return false; Line = m_Lines[m_LineNr++]; return true; } void HMMParams::FromFile(const string &FileName) { vector Lines; ReadStringsFromFile(FileName, Lines); FromStrings(Lines); } void HMMParams::FromStrings(const vector &Lines) { m_Lines = Lines; m_LineNr = 0; string AlphaLine; bool Ok = GetNextLine(AlphaLine); asserta(Ok); vector Fields; Split(AlphaLine, Fields, '\t'); if (SIZE(Fields) != 2 || Fields[0] != "HMM") Die("Invalid HMM file"); m_Alpha = Fields[1]; if (Fields[1] == "aa") m_Alpha = AMINO_ALPHA; else if (Fields[1] == "nt") m_Alpha = NT_ALPHA; else Die("Invalid HMM alphabet '%s'", m_Alpha.c_str()); m_Trans.clear(); m_Trans.resize(HMMTRANS_N, FLT_MAX); #define T(x) m_Trans[HMMTRANS_##x] = GetNextProb("T." 
#x); #include "hmmtrans.h" const uint AlphaSize = SIZE(m_Alpha); m_Emits.clear(); m_Emits.resize(AlphaSize); for (uint i = 0; i < AlphaSize; ++i) m_Emits[i].resize(AlphaSize, FLT_MAX); for (uint i = 0; i < AlphaSize; ++i) { char a = m_Alpha[i]; for (uint j = 0; j <= i; ++j) { char b = m_Alpha[j]; string Name; Ps(Name, "E.%c%c", a, b); float P = GetNextProb(Name); m_Emits[i][j] = P; m_Emits[j][i] = P; } } AssertProbsValid(); } void HMMParams::FromDefaults(bool Nucleo) { vector Lines; if (Nucleo) GetDefaultHMMParams_Nucleo(Lines); else GetDefaultHMMParams_Amino(Lines); FromStrings(Lines); } void HMMParams::ToPairHMM() const { HMMParams Scores; GetScores(Scores); HMMParams Probs; GetProbs(Probs); const uint AlphaSize = GetAlphaSize(); const vector &Trans = Scores.m_Trans; const vector > &Emits = Scores.m_Emits; float SumInserts = 0; vector InsertScores; for (uint i = 0; i < AlphaSize; ++i) { float MarginalProb = 0; for (uint j = 0; j < AlphaSize; ++j) { float P = Probs.m_Emits[i][j]; MarginalProb += P; } float Score = log(MarginalProb); InsertScores.push_back(Score); SumInserts += MarginalProb; } asserta(feq(SumInserts, 1.0)); PairHMM::m_StartScore[HMMSTATE_M] = Trans[HMMTRANS_START_M]; PairHMM::m_StartScore[HMMSTATE_IX] = Trans[HMMTRANS_START_IS]; PairHMM::m_StartScore[HMMSTATE_IY] = Trans[HMMTRANS_START_IS]; PairHMM::m_StartScore[HMMSTATE_JX] = Trans[HMMTRANS_START_IL]; PairHMM::m_StartScore[HMMSTATE_JY] = Trans[HMMTRANS_START_IL]; for (uint i = 0; i < HMMSTATE_COUNT; ++i) for (uint j = 0; j < HMMSTATE_COUNT; ++j) PairHMM::m_TransScore[i][j] = LOG_ZERO; PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_M] = Trans[HMMTRANS_M_M]; PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_IX] = Trans[HMMTRANS_M_IS]; PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_IY] = Trans[HMMTRANS_M_IS]; PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_JX] = Trans[HMMTRANS_M_IL]; PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_JY] = Trans[HMMTRANS_M_IL]; PairHMM::m_TransScore[HMMSTATE_IX][HMMSTATE_IX] = 
Trans[HMMTRANS_IS_IS]; PairHMM::m_TransScore[HMMSTATE_IY][HMMSTATE_IY] = Trans[HMMTRANS_IS_IS]; PairHMM::m_TransScore[HMMSTATE_JX][HMMSTATE_JX] = Trans[HMMTRANS_IL_IL]; PairHMM::m_TransScore[HMMSTATE_JY][HMMSTATE_JY] = Trans[HMMTRANS_IL_IL]; PairHMM::m_TransScore[HMMSTATE_IX][HMMSTATE_M] = Trans[HMMTRANS_IS_M]; PairHMM::m_TransScore[HMMSTATE_IY][HMMSTATE_M] = Trans[HMMTRANS_IS_M]; PairHMM::m_TransScore[HMMSTATE_JX][HMMSTATE_M] = Trans[HMMTRANS_IL_M]; PairHMM::m_TransScore[HMMSTATE_JY][HMMSTATE_M] = Trans[HMMTRANS_IL_M]; float WildcardInsertProb = 1.0f/AlphaSize; for (uint i = 0; i < 256; ++i) PairHMM::m_InsScore[i] = log(WildcardInsertProb); for (uint i = 0; i < AlphaSize; ++i) { char a = m_Alpha[i]; byte ia = (byte) tolower(a); byte iA = (byte) toupper(a); float P = InsertScores[i]; PairHMM::m_InsScore[ia] = P; PairHMM::m_InsScore[iA] = P; } for (uint i = 0; i < 256; ++i) for (uint j = 0; j < 256; ++j) PairHMM::m_MatchScore[i][j] = log(WildcardInsertProb*WildcardInsertProb); for (uint i = 0; i < AlphaSize; ++i) { char a = m_Alpha[i]; byte ia = (byte) tolower(a); byte iA = (byte) toupper(a); for (uint j = 0; j < AlphaSize; ++j) { float P = Emits[i][j]; char b = m_Alpha[j]; byte ib = (byte) tolower(b); byte iB = (byte) toupper(b); PairHMM::m_MatchScore[ia][ib] = P; PairHMM::m_MatchScore[ia][iB] = P; PairHMM::m_MatchScore[iA][ib] = P; PairHMM::m_MatchScore[iA][iB] = P; } } if (AlphaSize == 4) PairHMM::FixUT(); } void PairHMM::FixUT() { PairHMM::m_InsScore['U'] = PairHMM::m_InsScore['T']; PairHMM::m_InsScore['u'] = PairHMM::m_InsScore['t']; for (uint i = 0; i < 256; ++i) { float P = PairHMM::m_MatchScore['T'][i]; PairHMM::m_MatchScore['U'][i] = P; PairHMM::m_MatchScore['u'][i] = P; PairHMM::m_MatchScore[i]['U'] = P; PairHMM::m_MatchScore[i]['u'] = P; } } void HMMParams::NormalizeEmit() { asserta(!m_Logs); const uint AlphaSize = GetAlphaSize(); float Sum = 0; for (uint i = 0; i < AlphaSize; ++i) { for (uint j = 0; j <= i; ++j) { float P = m_Emits[i][j]; m_Emits[i][j] = 
P; m_Emits[j][i] = P; Sum += P; if (i != j) Sum += P; } } for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j < AlphaSize; ++j) m_Emits[i][j] /= Sum; } void HMMParams::NormalizeStart() { float Sum = m_Trans[HMMTRANS_START_M] + 2*m_Trans[HMMTRANS_START_IS] + 2*m_Trans[HMMTRANS_START_IL]; m_Trans[HMMTRANS_START_M] /= Sum; m_Trans[HMMTRANS_START_IS] /= Sum; m_Trans[HMMTRANS_START_IL] /= Sum; } void HMMParams::NormalizeShortGap() { float SumM = m_Trans[HMMTRANS_M_M] + 2*m_Trans[HMMTRANS_M_IS] + 2*m_Trans[HMMTRANS_M_IL]; m_Trans[HMMTRANS_M_M] /= SumM; m_Trans[HMMTRANS_M_IS] /= SumM; m_Trans[HMMTRANS_M_IL] /= SumM; float SumIS = m_Trans[HMMTRANS_IS_IS] + m_Trans[HMMTRANS_IS_M]; m_Trans[HMMTRANS_IS_IS] /= SumIS; m_Trans[HMMTRANS_IS_M] /= SumIS; } void HMMParams::NormalizeLongGap() { float SumM = m_Trans[HMMTRANS_M_M] + 2*m_Trans[HMMTRANS_M_IS] + 2*m_Trans[HMMTRANS_M_IL]; m_Trans[HMMTRANS_M_M] /= SumM; m_Trans[HMMTRANS_M_IS] /= SumM; m_Trans[HMMTRANS_M_IL] /= SumM; float SumIL = m_Trans[HMMTRANS_IL_IL] + m_Trans[HMMTRANS_IL_M]; m_Trans[HMMTRANS_IL_IL] /= SumIL; m_Trans[HMMTRANS_IL_M] /= SumIL; } void HMMParams::NormalizeMatch() { float SumM = m_Trans[HMMTRANS_M_M] + 2*m_Trans[HMMTRANS_M_IS] + 2*m_Trans[HMMTRANS_M_IL]; m_Trans[HMMTRANS_M_M] /= SumM; m_Trans[HMMTRANS_M_IS] /= SumM; m_Trans[HMMTRANS_M_IL] /= SumM; } void HMMParams::Normalize() { NormalizeStart(); NormalizeShortGap(); NormalizeLongGap(); NormalizeEmit(); AssertProbsValid(); } muscle-5.1.0/src/hmmparams.h000066400000000000000000000037251424453062600157470ustar00rootroot00000000000000#pragma once enum HMMTRANS { #define T(x) HMMTRANS_##x, #include "hmmtrans.h" HMMTRANS_N }; static const string AMINO_ALPHA = "ACDEFGHIKLMNPQRSTVWY"; static const string NT_ALPHA = "ACGT"; //#define HMM_ALPHA "ACDEFGHIKLMNPQRSTVWY" //const uint HMM_ALPHASIZE = 20; const float DEFAULT_PERTURB_VAR = 0.25f; class HMMParams { public: bool m_Logs = false; uint m_LineNr = 0; float m_Var; vector m_Trans; vector > m_Emits; vector 
m_Lines; string m_Alpha; public: HMMParams() { m_Logs = false; m_LineNr = 0; m_Var = DEFAULT_PERTURB_VAR; } public: void Clear() { m_Logs = false; m_LineNr = 0; m_Var = DEFAULT_PERTURB_VAR; m_Trans.clear(); m_Emits.clear(); m_Lines.clear(); m_Alpha.clear(); } uint GetAlphaSize() const { uint n = SIZE(m_Alpha); asserta(n == 4 || n == 20); return n; } void FromParams(const HMMParams &Params, bool AsProbs); void FromStrings(const vector &Lines); void FromFile(const string &FileName); void FromDefaults(bool Nucleo); void PerturbProbs(uint Seed); void ToSingleAffineProbs(HMMParams &Params); void Normalize(); void NormalizeStart(); void NormalizeMatch(); void NormalizeShortGap(); void NormalizeLongGap(); void NormalizeEmit(); void AssertProbsValid() const; void ToFile(const string &FileName) const; void ToPairHMM() const; void GetProbs(HMMParams &Probs) const; void GetScores(HMMParams &Scores) const; private: bool GetNextLine(string &Line); float GetNextProb(const string &Name); public: static void ScoresToProbs(const HMMParams &Scores, HMMParams &Probs); static void ProbsToScores(const HMMParams &Probs, HMMParams &Scores); static void GetDefaultHMMParams(bool Nucleo, vector &Lines); static void GetDefaultHMMParams_Amino(vector &Lines); static void GetDefaultHMMParams_Nucleo(vector &Lines); static void Compare(const HMMParams &HP1, const HMMParams &HP2, float &MeanTransDelta, float &MeanEmitDelta); }; muscle-5.1.0/src/hmmscores.h000066400000000000000000000013311424453062600157510ustar00rootroot00000000000000const float tSM = PairHMM::m_StartScore[HMMSTATE_M]; const float tSI = PairHMM::m_StartScore[HMMSTATE_IX]; const float tSJ = PairHMM::m_StartScore[HMMSTATE_JX]; const float tMM = PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_M]; const float tMI = PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_IX]; const float tMJ = PairHMM::m_TransScore[HMMSTATE_M][HMMSTATE_JX]; const float tII = PairHMM::m_TransScore[HMMSTATE_IX][HMMSTATE_IX]; const float tIM = 
PairHMM::m_TransScore[HMMSTATE_IX][HMMSTATE_M]; const float tJJ = PairHMM::m_TransScore[HMMSTATE_JX][HMMSTATE_JX]; const float tJM = PairHMM::m_TransScore[HMMSTATE_JX][HMMSTATE_M]; const t_ByteMx &MatchScore = PairHMM::m_MatchScore; const t_ByteVec &InsScore = PairHMM::m_InsScore; muscle-5.1.0/src/hmmtrans.h000066400000000000000000000002221424453062600156000ustar00rootroot00000000000000#ifndef T #error "T not defined" #endif T(START_M) T(START_IS) T(START_IL) T(M_M) T(M_IS) T(M_IL) T(IS_IS) T(IS_M) T(IL_IL) T(IL_M) #undef T muscle-5.1.0/src/jalview.cpp000066400000000000000000000044171424453062600157550ustar00rootroot00000000000000#include "muscle.h" #include "heatmapcolors.h" static void GetConfRanges_Ungapped(const char *Seq, uint L, vector &Confs, vector &Los, vector &His) { Confs.clear(); Los.clear(); His.clear(); char CurrConf = Seq[0]; uint Lo = 0; uint Pos = 0; if (!isgap(Seq[0])) ++Pos; for (uint i = 1; i <= L; ++i) { char Conf = (i == L ? 0 : Seq[i]); if (Conf != CurrConf) { uint Hi = Pos - 1; uint n = SIZE(His); if (n > 0) asserta(Lo == His[n-1] + 1); Confs.push_back(CurrConf); Los.push_back(Lo); His.push_back(Hi); Lo = Pos; CurrConf = Conf; } if (!isgap(Conf)) ++Pos; } } void WriteLetterConfJalView(const string &FileName, const MSA &Ref, const MSA &ConfAln) { if (FileName.empty()) return; const uint SeqCount = ConfAln.GetSeqCount(); const uint ColCount = ConfAln.GetColCount(); if (Ref.GetSeqCount() != SeqCount || Ref.GetColCount() != ColCount) Die("-ref has different number of rows or columns"); vector Labels; uint MaxLabelLength = 0; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string Label = (string) ConfAln.GetSeqName(SeqIndex); const string RefLabel = (string) Ref.GetSeqName(SeqIndex); if (Label != RefLabel) Die("-ref labels do not match, seq %u input=%s ref=%s", SeqIndex + 1, Label.c_str(), RefLabel.c_str()); MaxLabelLength = max(MaxLabelLength, SIZE(Label)); Labels.push_back(Label); } FILE *fOut = CreateStdioFile(FileName); for 
(uint i = 0; i < 10; ++i) fprintf(fOut, "LC%u\t%s\n", i, g_HeatmapColors_JalView[i]); fprintf(fOut, "STARTGROUP Muscle5_LetterConfs\n"); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string &Label = Labels[SeqIndex]; const char *Seq = ConfAln.m_szSeqs[SeqIndex]; vector Confs; vector Los; vector His; GetConfRanges_Ungapped(Seq, ColCount, Confs, Los, His); const uint RangeCount = SIZE(Confs); asserta(SIZE(His) == RangeCount); asserta(SIZE(Los) == RangeCount); for (uint i = 0; i < RangeCount; ++i) { char Conf = Confs[i]; uint Lo = Los[i]; uint Hi = His[i]; if (Conf == '-') continue; fprintf(fOut, "- %s %u %u %u LC%c\n", Label.c_str(), SeqIndex, Lo+1, Hi+1, Conf); } } fprintf(fOut, "ENDGROUP Muscle5_LetterConfs\n"); } muscle-5.1.0/src/jointrees.cpp000066400000000000000000000031311424453062600163060ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" void JoinTrees(const Tree &Tree1, const Tree &Tree2, Tree &OutputTree, float NewEdgeLength) { const uint NodeCount1 = Tree1.GetNodeCount(); const uint NodeCount2 = Tree2.GetNodeCount(); vector Labels1; vector Labels2; vector Parents1; vector Parents2; vector Lengths1; vector Lengths2; Tree1.ToVectors(Labels1, Parents1, Lengths1); Tree2.ToVectors(Labels2, Parents2, Lengths2); uint Root = NodeCount1 + NodeCount2; vector Labels; vector Parents; vector Lengths; bool Root1Found = false; for (uint Node1 = 0; Node1 < NodeCount1; ++Node1) { string Label = Labels1[Node1]; float Length = Lengths1[Node1]; uint Parent = Parents1[Node1]; if (Parent == UINT_MAX) { asserta(!Root1Found); Root1Found = true; Parent = Root; Length = NewEdgeLength; } Labels.push_back(Label); Parents.push_back(Parent); Lengths.push_back(Length); } asserta(Root1Found); bool Root2Found = false; for (uint Node2 = 0; Node2 < NodeCount2; ++Node2) { string Label = Labels2[Node2]; float Length = Lengths2[Node2]; uint Parent2 = Parents2[Node2]; uint Parent; if (Parent2 == UINT_MAX) { asserta(!Root2Found); Root2Found = true; Parent = 
Root; Length = NewEdgeLength; } else Parent = Parent2 + NodeCount1; Labels.push_back(Label); Parents.push_back(Parent); Lengths.push_back(Length); } asserta(Root2Found); Labels.push_back("ROOT"); Parents.push_back(UINT_MAX); Lengths.push_back(0); OutputTree.FromVectors(Labels, Parents, Lengths); } muscle-5.1.0/src/kmerscan.h000066400000000000000000000012501424453062600155540ustar00rootroot00000000000000#pragma once typedef uint fn_OnKmer(uint32 Code, uint32 Pos, void *UserData); void SyncmerScanNt(const byte *Seq, uint Lo, uint Len, uint k, uint d, fn_OnKmer OnKmer, void *UserData); void SyncmerScanAa(const byte *Seq, uint Lo, uint Len, uint k, uint d, fn_OnKmer OnKmer, void *UserData); uint32 GetKmerMaskNt(uint k); uint32 GetKmerMaskAa(uint k); const char *CodeToStrNt(uint32 Code, uint k); const char *CodeToStrAa(uint32 Code, uint k); uint32 SeqToCodeNt(const byte *Seq, uint k); uint32 SeqToCodeAa(const byte *Seq, uint k); uint32 SeqToCode(bool Nucleo, const byte *Seq, uint k); static const uint BITS_PER_LETTER_SEB8 = 3; static const uint ALPHA_SIZE_SEB8 = 8; muscle-5.1.0/src/letterconf.cpp000066400000000000000000000052461424453062600164620ustar00rootroot00000000000000#include "muscle.h" #include "qscorer.h" #include "ensemble.h" void WriteLetterConfHTML(const string &FileName, const MSA &Ref, const MSA &ConfAln); void WriteLetterConfJalView(const string &FileName, const MSA &Ref, const MSA &ConfAln); void Ensemble::GetLetterConfsVec(const MSA &Ref, double MaxGapFract, vector > &LetterConfsVec) const { QScorer QS; vector > LetterCountsVec; const uint TestMSACount = GetMSACount(); for (uint TestMSAIndex = 0; TestMSAIndex < TestMSACount; ++TestMSAIndex) { const MSA &Test = GetMSA(TestMSAIndex); QS.Run(Test, Ref); QS.UpdateRefLetterCounts(LetterCountsVec); } const uint RefSeqCount = QS.GetRefSeqCount(); const uint RefColCount = QS.GetRefColCount(); asserta(SIZE(LetterCountsVec) == RefSeqCount); asserta(SIZE(LetterCountsVec[0]) == RefColCount); 
LetterConfsVec.resize(RefSeqCount); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; ++RefSeqIndex) { for (uint RefColIndex = 0; RefColIndex < RefColCount; ++RefColIndex) { uint n = LetterCountsVec[RefSeqIndex][RefColIndex]; char c = Ref.GetChar(RefSeqIndex, RefColIndex); uint LetterConf = UINT_MAX; if (c == '-' || c == '.') asserta(n == 0); else LetterConf = (n*9)/TestMSACount; LetterConfsVec[RefSeqIndex].push_back(LetterConf); } } } static char GetConfChar(uint n) { if (n == UINT_MAX) return '-'; asserta(n <= 9); return '0' + n; } void cmd_letterconf() { const string EnsembleFileName = opt(letterconf); const string RefFileName = opt(ref); double MaxGapFract = optd(max_gap_fract, 1.0); string Name; GetBaseName(RefFileName.c_str(), Name); MSA Ref; MSA RefPC; Ref.FromFASTAFile(RefFileName); RefPC.FromFASTAFile_PreserveCase(RefFileName); const uint RefSeqCount = Ref.GetSeqCount(); const uint RefColCount = Ref.GetColCount(); Ensemble E; E.FromFile(EnsembleFileName); QScorer QS; QS.m_MaxGapFract = MaxGapFract; vector > LetterConfsVec; E.GetLetterConfsVec(Ref, MaxGapFract, LetterConfsVec); asserta(SIZE(LetterConfsVec) == RefSeqCount); asserta(SIZE(LetterConfsVec[0]) == RefColCount); MSA ConfAln; ConfAln.Copy(Ref); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; ++RefSeqIndex) { const string RefLabel = Ref.GetSeqName(RefSeqIndex); string SeqStr; for (uint RefColIndex = 0; RefColIndex < RefColCount; ++RefColIndex) { uint n = LetterConfsVec[RefSeqIndex][RefColIndex]; asserta(n <= 9 || n == UINT_MAX); char ConfChar = GetConfChar(n); ConfAln.SetChar(RefSeqIndex, RefColIndex, ConfChar); } } ConfAln.ToFASTAFile(opt(output)); WriteLetterConfHTML(opt(html), RefPC, ConfAln); WriteLetterConfJalView(opt(jalview), RefPC, ConfAln); } muscle-5.1.0/src/letterconfhtml.cpp000066400000000000000000000121251424453062600173410ustar00rootroot00000000000000#include "muscle.h" #include "heatmapcolors.h" uint GetOverlap(uint Lo1, uint Hi1, uint Lo2, uint Hi2) { uint MaxLo = max(Lo1, 
Lo2); uint MinHi = min(Hi1, Hi2); if (MaxLo > MinHi) return 0; return MinHi - MaxLo + 1; } static void GetConfRanges_Gapped(const char *Seq, uint L, vector &Confs, vector &Los, vector &His) { Confs.clear(); Los.clear(); His.clear(); char CurrConf = Seq[0]; uint Lo = 0; for (uint i = 1; i <= L; ++i) { char Conf = (i == L ? 0 : Seq[i]); if (Conf != CurrConf) { uint Hi = i - 1; uint n = SIZE(His); if (n > 0) asserta(Lo == His[n-1] + 1); Confs.push_back(CurrConf); Los.push_back(Lo); His.push_back(Hi); Lo = i; CurrConf = Conf; } } } static void HTML_Head(FILE *f) { if (f == 0) return; fprintf(f, "\n" "\n" "\n" " \n" " Muscle5 alignment\n" " \n" "\n" "\n" "\n" "
\n" "
\n" ); } static void HTML_Foot(FILE *f) { if (f == 0) return; fprintf(f, "
\n" " Confidence high\n" ); fprintf(f, " "); for (int i = 9; i >= 0; --i) fprintf(f, "%c", '0' + i, '0' + i); fprintf(f, " low\n" ); fprintf(f, "\n"); fprintf(f, //" 0123456789\n" "
\n" " \n" "\n" ); } void WriteLetterConfHTML(const string &FileName, const MSA &Ref, const MSA &ConfAln) { if (FileName.empty()) return; FILE *fOut = CreateStdioFile(FileName); HTML_Head(fOut); const uint SeqCount = ConfAln.GetSeqCount(); const uint ColCount = ConfAln.GetColCount(); if (Ref.GetSeqCount() != SeqCount || Ref.GetColCount() != ColCount) Die("-ref has different number of rows or columns"); vector Labels; uint MaxLabelLength = 0; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string Label = (string) ConfAln.GetSeqName(SeqIndex); const string RefLabel = (string) Ref.GetSeqName(SeqIndex); if (Label != RefLabel) Die("-ref labels do not match, seq %u input=%s ref=%s", SeqIndex + 1, Label.c_str(), RefLabel.c_str()); MaxLabelLength = max(MaxLabelLength, SIZE(Label)); Labels.push_back(Label); } const uint ROWLEN = 80; unsigned BlockCount = (ColCount + ROWLEN - 1)/ROWLEN; for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex) { unsigned BlockLo = BlockIndex*ROWLEN; unsigned BlockHi = BlockLo + ROWLEN - 1; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const string &Label = Labels[SeqIndex]; const uint n = SIZE(Label); fputs("", fOut); for (uint k = n; k < MaxLabelLength; ++k) fputs(" ", fOut); fputs(Label.c_str(), fOut); fputs("  ", fOut); const char *Seq = ConfAln.m_szSeqs[SeqIndex]; vector Confs; vector Los; vector His; GetConfRanges_Gapped(Seq, ColCount, Confs, Los, His); const uint RangeCount = SIZE(Confs); asserta(SIZE(His) == RangeCount); asserta(SIZE(Los) == RangeCount); for (uint i = 0; i < RangeCount; ++i) { char Conf = Confs[i]; uint Lo = Los[i]; uint Hi = His[i]; uint Overlap = GetOverlap(BlockLo, BlockHi, Lo, Hi); if (Overlap == 0) continue; if (i > 0) asserta(Lo == His[i-1] + 1); if (Conf >= '0' && Conf <= '9') fprintf(fOut, "", Conf); else if (Conf == '-') fprintf(fOut, ""); else Die("Bad conf=%c", Conf); for (uint Pos = Lo; Pos <= Hi; ++Pos) { if (Pos < BlockLo || Pos > BlockHi) continue; char c = 
Ref.GetChar(SeqIndex, Pos); fputc(c, fOut); } fprintf(fOut, ""); } fprintf(fOut, "
\n"); } fprintf(fOut, "
\n"); fprintf(fOut, "
\n"); } HTML_Foot(fOut); } void cmd_letterconf_html() { extern bool g_FASTA_AllowDigits; g_FASTA_AllowDigits = true; MSA ConfAln; ConfAln.FromFASTAFile(opt(letterconf_html)); g_FASTA_AllowDigits = false; const string &RefFileName = opt(output); if (RefFileName.empty()) Die("Must set -ref"); MSA Ref; Ref.FromFASTAFile_PreserveCase(opt(ref)); const string &OutputFileName = opt(output); if (OutputFileName.empty()) Die("Must set -output"); WriteLetterConfHTML(OutputFileName, Ref, ConfAln); } muscle-5.1.0/src/locallock.h000066400000000000000000000004131424453062600157140ustar00rootroot00000000000000#pragma once #include static omp_lock_t g_Lock; static bool InitLock() { omp_init_lock(&g_Lock); return true; } static bool g_InitDone = InitLock(); static void Lock() { omp_set_lock(&g_Lock); } static void Unlock() { omp_unset_lock(&g_Lock); } muscle-5.1.0/src/logaln.cpp000066400000000000000000000064611424453062600155710ustar00rootroot00000000000000#include "muscle.h" void MakeAlnRows(const string &X, const string &Y, const string &PathXY, string &RowX, string &RowY) { RowX.clear(); RowY.clear(); const byte *XSeq = (const byte *) X.c_str(); const byte *YSeq = (const byte *) Y.c_str(); const uint ColCount = SIZE(PathXY); const uint LX = SIZE(X); const uint LY = SIZE(Y); uint XPos = 0; uint YPos = 0; for (uint Col = 0; Col < ColCount; ++Col) { char c = PathXY[Col]; if (c == 'B') { RowX += XSeq[XPos]; RowY += YSeq[YPos]; ++YPos; ++XPos; } else if (c == 'X') { RowX += XSeq[XPos]; RowY += '-'; ++XPos; } else if (c == 'Y') { RowY += YSeq[YPos]; RowX += '-'; ++YPos; } else asserta(false); } asserta(XPos == LX && YPos == LY); } void MakeAlnRows(const Sequence &X, const Sequence &Y, const string &PathXY, string &RowX, string &RowY) { RowX.clear(); RowY.clear(); const byte *XSeq = X.GetBytePtr(); const byte *YSeq = Y.GetBytePtr(); const uint ColCount = SIZE(PathXY); const uint LX = X.GetLength(); const uint LY = Y.GetLength(); uint XPos = 0; uint YPos = 0; for (uint Col = 0; Col < 
ColCount; ++Col) { char c = PathXY[Col]; if (c == 'B') { RowX += XSeq[XPos]; RowY += YSeq[YPos]; ++YPos; ++XPos; } else if (c == 'X') { RowX += XSeq[XPos]; RowY += '-'; ++XPos; } else if (c == 'Y') { RowY += YSeq[YPos]; RowX += '-'; ++YPos; } else asserta(false); } asserta(XPos == LX && YPos == LY); } void LogAln(const string &X, const string &Y, const string &PathXY) { string RowX; string RowY; MakeAlnRows(X, Y, PathXY, RowX, RowY); Log("\n"); Log("%s\n", RowX.c_str()); Log("%s\n", RowY.c_str()); } void LogAln(const Sequence &X, const Sequence &Y, const string &PathXY) { string RowX; string RowY; MakeAlnRows(X, Y, PathXY, RowX, RowY); const string &LabelX = X.GetLabel(); const string &LabelY = Y.GetLabel(); Log("\n"); Log("%10.10s %s\n", LabelX.c_str(), RowX.c_str()); Log("%10.10s %s\n", LabelY.c_str(), RowY.c_str()); } void PathToColVecs(const string &PathXY, vector &PosToColX, vector &PosToColY, vector &ColToPosX, vector &ColToPosY) { PosToColX.clear(); PosToColY.clear(); ColToPosX.clear(); PosToColY.clear(); const uint ColCount = SIZE(PathXY); for (uint Col = 0; Col < ColCount; ++Col) { char c = PathXY[Col]; if (c == 'B') { uint PosX = SIZE(PosToColX); uint PosY = SIZE(PosToColY); ColToPosX.push_back(PosX); ColToPosY.push_back(PosY); PosToColX.push_back(Col); PosToColY.push_back(Col); } else if (c == 'X') { uint PosX = SIZE(PosToColX); ColToPosX.push_back(PosX); ColToPosY.push_back(UINT_MAX); PosToColX.push_back(Col); } else if (c == 'Y') { uint PosY = SIZE(PosToColY); ColToPosY.push_back(PosY); ColToPosX.push_back(UINT_MAX); PosToColY.push_back(Col); } else asserta(false); } // Validate { asserta(SIZE(ColToPosX) == ColCount); asserta(SIZE(ColToPosY) == ColCount); for (uint Col = 0; Col < ColCount; ++Col) { uint PosX = ColToPosX[Col]; uint PosY = ColToPosY[Col]; if (PosX != UINT_MAX) { asserta(PosX < SIZE(PosToColX)); asserta(PosToColX[PosX] == Col); } if (PosY != UINT_MAX) { asserta(PosY < SIZE(PosToColY)); asserta(PosToColY[PosY] == Col); } } } } 
muscle-5.1.0/src/logdistmx.cpp000066400000000000000000000010761424453062600163240ustar00rootroot00000000000000#include "muscle.h" void LogDistMx(const string &Msg, const vector > &Mx) { Log("\n"); Log("LogDistMx(%s)\n", Msg.c_str()); const uint RowCount = SIZE(Mx); asserta(RowCount > 0); const uint ColCount = SIZE(Mx[0]); for (uint Row = 0; Row < RowCount; ++Row) { Log("[%5u] ", Row); const uint ColCount = SIZE(Mx[Row]); for (uint Col = 0; Col < ColCount; ++Col) { float x = Mx[Row][Col]; if (x == FLT_MAX) Log(" %7.7s", "*"); else if (x == LOG_ZERO) Log(" %7.7s", "."); else Log(" %7.3g", x); } Log("\n"); } } muscle-5.1.0/src/logmx.cpp000066400000000000000000000037621424453062600154440ustar00rootroot00000000000000#include "muscle.h" void LogTomMx(const string &Name, const vector &Mx, uint LX, uint LY) { Log("\n"); Log("Tom %s: LX=%u LY=%u\n", Name.c_str(), LX, LY); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = 0; for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) { float P = Mx[Ix++]; Log(" %10.3g", P); } Log("\n"); } } // (LX+1) x (LY+1) void LogFlatMx1(const string &Name, const float *MyPost, uint LX, uint LY) { Log("\n"); Log("Flat1 %s: LX=%u LY=%u\n", Name.c_str(), LX, LY); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = 0; for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) { float P = MyPost[Ix++]; Log(" %10.3g", P); } Log("\n"); } } // LX x LY void LogFlatMx(const string &Name, const float *MyPost, uint LX, uint LY) { Log("\n"); Log("Flat %s: LX=%u LY=%u\n", Name.c_str(), LX, LY); Log(" "); for (uint j = 0; j < LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = 0; for (uint i = 0; i < LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j < LY; ++j) { float P = MyPost[Ix++]; Log(" %10.3g", P); } Log("\n"); } } // 5 x (LX + 1) x (LY + 1) void LogFlatMxs(const string &Name, const float *Mxs, uint LX, uint LY) { Log("\n"); for (uint s = 0; s < 
HMMSTATE_COUNT; ++s) { Log("Flat %s[%u]: LX=%u LY=%u\n", Name.c_str(), s, LX, LY); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = s; for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) { float x = Mxs[Ix]; Ix += HMMSTATE_COUNT; if (x == INVALID_LOG) Log(" %8.8s", "*ERR*"); if (x == OUT_OF_BAND_LOG) Log(" %8.8s", "#"); if (x == UNINIT_LOG) Log(" %8.8s", "-"); else if (x == LOG_ZERO) Log(" %8.8s", "."); else Log(" %8.3g", x); } Log("\n"); } } } muscle-5.1.0/src/main.cpp000066400000000000000000000014501424453062600152320ustar00rootroot00000000000000#include "muscle.h" #include "myutils.h" int main(int argc, char **argv) { for (int i = 1; i < argc; ++i) { string s = string(argv[i]); if (s == "-h") { void Usage(FILE *f); Usage(stdout); return 0; } if (s == "-help" || s == "--help") { void Help(); Help(); return 0; } } MyCmdLine(argc, argv); if (!opt(quiet)) { PrintBanner(stderr); if (argc < 2) return 0; } SetLogFileName(opt(log)); LogProgramInfoAndCmdLine(); uint CmdCount = 0; #define C(x) if (optset_##x) ++CmdCount; #include "cmds.h" if (CmdCount > 1) Die("More than one command specified"); #define C(x) \ if (optset_##x) \ { \ void cmd_##x(); \ cmd_##x(); \ CheckUsedOpts(false); \ LogElapsedTimeAndRAM(); \ return 0; \ } #include "cmds.h" #undef C return 0; } muscle-5.1.0/src/make_a2m.cpp000066400000000000000000000044541424453062600157710ustar00rootroot00000000000000#include "muscle.h" void cmd_make_a2m() { const string &InputFileName = opt(make_a2m); MSA Aln; Progress("Reading %s ...", InputFileName.c_str()); Aln.FromFASTAFile(InputFileName); Progress("done.\n"); FILE *fOut = CreateStdioFile(opt(output)); const uint SeqCount = Aln.GetSeqCount(); const uint ColCount = Aln.GetColCount(); asserta(SeqCount > 0); asserta(ColCount > 0); uint RefSeqIndex = 0; string RefLabel; if (optset_label) { RefLabel = opt(label); RefSeqIndex = Aln.GetSeqIndex(RefLabel); } else Aln.GetSeqLabel(0, RefLabel); vector RefPosToCol; 
vector RefColToPos; Aln.GetPosToCol(RefSeqIndex, RefPosToCol); Aln.GetColToPos(RefSeqIndex, RefColToPos); const uint RL = SIZE(RefPosToCol); vector IsInserts; IsInserts.resize(ColCount, false); uint FirstCol = RefPosToCol[0]; for (uint Col = 0; Col < FirstCol; ++Col) IsInserts[Col] = true; for (uint RefPos = 1; RefPos < RL; ++RefPos) { uint PrevCol = RefPosToCol[RefPos-1]; uint ThisCol = RefPosToCol[RefPos]; asserta(PrevCol < ThisCol); for (uint Col = PrevCol + 1; Col < ThisCol; ++Col) IsInserts[Col] = true; } uint LastCol = RefPosToCol[RL-1]; for (uint Col = LastCol + 1; Col < ColCount; ++Col) IsInserts[Col] = true; for (uint Col = 0; Col < ColCount; ++Col) { bool IsMatch = (RefColToPos[Col] != UINT_MAX); bool IsInsert = IsInserts[Col]; asserta(int(IsMatch) + int(IsInsert) == 1); } for (uint i = 0; i < SeqCount; ++i) { uint SeqIndex = i; if (i == 0) SeqIndex = RefSeqIndex; else if (i == RefSeqIndex) SeqIndex = 0; ProgressStep(i, SeqCount, "Converting"); Pf(fOut, ">%s\n", Aln.GetSeqName(SeqIndex)); string SeqStr; const char *SeqCharPtr = Aln.GetSeqCharPtr(SeqIndex); for (uint Col = 0; Col < ColCount; ++Col) { char c = SeqCharPtr[Col]; bool IsMatch = (RefColToPos[Col] != UINT_MAX); bool IsInsert = IsInserts[Col]; if (IsMatch) { asserta(!IsInsert); if (c == '.' || c == '-') c = '-'; else if (isalpha(c)) c = toupper(c); else Die("Bad char 0x%02x", c); } else if (IsInsert) { asserta(!IsMatch); if (c == '.' 
|| c == '-') continue; c = tolower(c); } else asserta(false); SeqStr += c; } Pf(fOut, "%s\n", SeqStr.c_str()); } CloseStdioFile(fOut); } muscle-5.1.0/src/maxcc.cpp000066400000000000000000000023641424453062600154060ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_maxcc() { const string &InputFileName = opt(maxcc); const string &OutputFileName = opt(output); if (OutputFileName.empty()) Die("Must set -output"); Ensemble E; E.FromFile(InputFileName); const uint MSACount = E.GetMSACount(); if (MSACount == 0) Die("Ensemble is empty"); uint BestMSAIndex = 0; double BestConf = 0; double SumConf = 0; double MinConf = 0; for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const string &Name = E.GetMSAName(MSAIndex); const MSA &M = *E.m_MSAs[MSAIndex]; double TotalConf = E.GetTotalConf(MSAIndex); ProgressStep(MSAIndex, MSACount, "%s (%.3g)", Name.c_str(), TotalConf); Log("%u/%u %s (%.3g)\n", MSAIndex, MSACount, Name.c_str(), TotalConf); if (MSAIndex == 0 || TotalConf < MinConf) MinConf = TotalConf; SumConf += TotalConf; if (TotalConf >= BestConf) { BestMSAIndex = MSAIndex; BestConf = TotalConf; } } const string &BestName = E.GetMSAName(BestMSAIndex); const MSA *BestMSA = E.m_MSAs[BestMSAIndex]; double AvgConf = SumConf/MSACount; ProgressLog("CC min %.3g, avg %.3g, max %.3g, best %s\n", MinConf, AvgConf, BestConf, BestName.c_str()); asserta(BestMSA != 0); BestMSA->ToFASTAFile(OutputFileName); } muscle-5.1.0/src/mpcflat.cpp000066400000000000000000000166241424453062600157450ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" #include "tree.h" #include "locallock.h" #define DOTIMING 0 #if DOTIMING #include "timing.h" static TICKS g_tFwd; static TICKS g_tBwd; static TICKS g_tPost; static TICKS g_tSparse; static TICKS g_tAln; static TICKS g_tDelete; #endif void MPCFlat::Clear() { m_InputSeqs = 0; if (m_MSA != 0) delete m_MSA; m_MSA = 0; m_Labels.clear(); m_LabelToIndex.clear(); m_Upgma5.Clear(); m_GuideTree.Clear(); 
m_DistMx.clear(); m_Pairs.clear(); m_PairToIndex.clear(); m_JoinIndexes1.clear(); m_JoinIndexes2.clear(); FreeSparsePosts(); FreeProgMSAs(); } const char *MPCFlat::GetLabel(uint SeqIndex) const { const char *Label = m_InputSeqs->GetLabel(SeqIndex); return Label; } uint MPCFlat::GetSeqLength(uint SeqIndex) const { uint L = m_InputSeqs->GetSeqLength(SeqIndex); return L; } const Sequence *MPCFlat::GetSequence(uint SeqIndex) const { const Sequence *s = m_InputSeqs->GetSequence(SeqIndex); return s; } const byte *MPCFlat::GetBytePtr(uint SeqIndex) const { const byte *Ptr = m_InputSeqs->GetBytePtr(SeqIndex); return Ptr; } uint MPCFlat::GetPairIndex(uint SMI1, uint SMI2) const { //asserta(SMI1 > SMI2); //uint PairIndex = SMI2 + (SMI1*(SMI1 - 1))/2; asserta(SMI1 < SMI2); const pair Pair(SMI1, SMI2); map, uint>::const_iterator p = m_PairToIndex.find(Pair); asserta(p != m_PairToIndex.end()); uint PairIndex = p->second; return PairIndex; } const pair &MPCFlat::GetPair(uint PairIndex) const { assert(PairIndex < SIZE(m_Pairs)); return m_Pairs[PairIndex]; } void MPCFlat::AllocPairCount(uint PairCount) { asserta(PairCount > 0); if (PairCount < SIZE(*m_ptrSparsePosts)) return; m_SparsePosts1.resize(PairCount); m_SparsePosts2.resize(PairCount); } MySparseMx &MPCFlat::GetSparsePost(uint PairIndex) { asserta(PairIndex < SIZE(*m_ptrSparsePosts)); MySparseMx *Mx = (*m_ptrSparsePosts)[PairIndex]; if (Mx == 0) { Mx = new MySparseMx; (*m_ptrSparsePosts)[PairIndex] = Mx; } return *Mx; } MySparseMx &MPCFlat::GetUpdatedSparsePost(uint PairIndex) { asserta(PairIndex < SIZE(*m_ptrUpdatedSparsePosts)); MySparseMx *Mx = (*m_ptrUpdatedSparsePosts)[PairIndex]; if (Mx == 0) { Mx = new MySparseMx; (*m_ptrUpdatedSparsePosts)[PairIndex] = Mx; } return *Mx; } uint MPCFlat::GetL(uint SeqIndex) const { return m_InputSeqs->GetSeqLength(SeqIndex); } uint MPCFlat::GetSeqCount() const { asserta(m_InputSeqs != 0); uint SeqCount = m_InputSeqs->GetSeqCount(); return SeqCount; } void 
MPCFlat::InitSeqs(MultiSequence *InputSeqs) { m_InputSeqs = InputSeqs; const uint SeqCount = GetSeqCount(); m_Labels.clear(); m_LabelToIndex.clear(); for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq = InputSeqs->GetSequence(i); Sequence *HackSeq = (Sequence *) Seq; HackSeq->m_SMI = i; const string &Label = Seq->GetLabel(); m_Labels.push_back(Label); if (m_LabelToIndex.find(Label) != m_LabelToIndex.end()) Die("Duplicate label >%s", Label.c_str()); m_LabelToIndex[Label] = i; } } void MPCFlat::InitPairs() { const uint SeqCount = GetSeqCount(); m_Pairs.clear(); m_PairToIndex.clear(); uint PairIndex = 0; for (uint SeqIndex1 = 0; SeqIndex1 < SeqCount; ++SeqIndex1) for (uint SeqIndex2 = SeqIndex1 + 1; SeqIndex2 < SeqCount; ++SeqIndex2) { const pair Pair(SeqIndex1, SeqIndex2); m_Pairs.push_back(Pair); assert(m_PairToIndex.find(Pair) == m_PairToIndex.end()); m_PairToIndex[Pair] = PairIndex; uint PairIndex2 = GetPairIndex(SeqIndex1, SeqIndex2); asserta(PairIndex2 == PairIndex); ++PairIndex; } uint PairCount = (SeqCount * (SeqCount - 1)) / 2; uint PairCount2 = SIZE(m_Pairs); asserta(PairCount == PairCount2); } void MPCFlat::InitDistMx() { const uint SeqCount = GetSeqCount(); m_DistMx.clear(); m_DistMx.resize(SeqCount); for (uint i = 0; i < SeqCount; ++i) { m_DistMx[i].resize(SeqCount, FLT_MAX); m_DistMx[i][i] = 0; } } void MPCFlat::Consistency() { const uint SeqCount = GetSeqCount(); if (SeqCount < 3) return; for (uint Iter = 0; Iter < m_ConsistencyIterCount; ++Iter) ConsIter(Iter); } void MPCFlat::CalcGuideTree() { if (opt(randomchaintree)) { CalcGuideTree_RandomChain(); return; } m_Upgma5.Init(m_Labels, m_DistMx); m_Upgma5.FixEADistMx(); m_Upgma5.Run(LINKAGE_Biased, m_GuideTree); PermTree(m_GuideTree, m_TreePerm); } void MPCFlat::CalcJoinOrder() { GetGuideTreeJoinOrder(m_GuideTree, m_LabelToIndex, m_JoinIndexes1, m_JoinIndexes2); ValidateJoinOrder(m_JoinIndexes1, m_JoinIndexes2); } void MPCFlat::CalcPosteriors() { #if 0//TRACE { Log("MPCFlat::CalcPosteriors 
SIZE(m_SparsePosts1)=%u SIZE(m_SparsePosts2)=%u\n", SIZE(m_SparsePosts1), SIZE(m_SparsePosts2)); for (uint i = 0; i < SIZE(m_SparsePosts1); ++i) { const MySparseMx *M = m_SparsePosts1[i]; Log("m_SparsePosts1[%u]=[%p]", i, M); if (M != 0) Log(" maxlx %u\n", M->m_MaxLX); Log("\n"); } for (uint i = 0; i < SIZE(m_SparsePosts2); ++i) { const MySparseMx *M = m_SparsePosts2[i]; Log("m_SparsePosts2[%u]=[%p]", i, M); if (M != 0) Log(" maxlx %u\n", M->m_MaxLX); Log("\n"); } } #endif uint PairCount = SIZE(m_Pairs); asserta(PairCount > 0); unsigned ThreadCount = GetRequestedThreadCount(); uint PairCounter = 0; #pragma omp parallel for num_threads(ThreadCount) for (int PairIndex = 0; PairIndex < (int) PairCount; ++PairIndex) { Lock(); ProgressStep(PairCounter++, PairCount, "Calc posteriors"); Unlock(); CalcPosterior(PairIndex); } } void MPCFlat::Refine() { const uint SeqCount = GetSeqCount(); if (SeqCount < 3) return; for (uint Iter = 0; Iter < m_RefineIterCount; ++Iter) { ProgressStep(Iter, m_RefineIterCount, "Refining"); RefineIter(); } } void MPCFlat::Run_Super4(MultiSequence *ConsensusSeqs) { assert(ConsensusSeqs != 0); Clear(); const uint SeqCount = ConsensusSeqs->GetSeqCount(); asserta(SeqCount > 1); uint PairCount = (SeqCount*(SeqCount-1))/2; AllocPairCount(PairCount); InitSeqs(ConsensusSeqs); InitPairs(); InitDistMx(); CalcPosteriors(); Consistency(); CalcGuideTree(); } void MPCFlat::Run(MultiSequence *InputSeqs) { assert(InputSeqs != 0); Clear(); const uint SeqCount = InputSeqs->GetSeqCount(); if (SeqCount == 1) { m_MSA = InputSeqs; return; } uint PairCount = (SeqCount*(SeqCount-1))/2; AllocPairCount(PairCount); InitSeqs(InputSeqs); InitPairs(); InitDistMx(); CalcPosteriors(); Consistency(); CalcGuideTree(); CalcJoinOrder(); ProgressiveAlign(); Refine(); asserta(m_MSA != 0); } MultiSequence *RunMPCFlat(MultiSequence *InputSeqs) { MPCFlat M; if (optset_consiters) M.m_ConsistencyIterCount = opt(consiters); if (optset_refineiters) M.m_RefineIterCount = opt(refineiters); 
TREEPERM TP = TP_None; if (optset_perm) TP = StrToTREEPERM(opt(perm)); if (TP == TP_All) Die("-perm all not supported, please specify none, abc, acb or bca"); M.m_TreePerm = TP; M.Run(InputSeqs); asserta(M.m_MSA != 0); return M.m_MSA; } #if DOTIMING void LogTiming() { double Sum = g_tFwd + g_tBwd + g_tPost + g_tAln + g_tDelete; ProgressLog("%10.3g %4.1f%% Fwd\n", double(g_tFwd), GetPct(g_tFwd, Sum)); ProgressLog("%10.3g %4.1f%% Bwd\n", double(g_tBwd), GetPct(g_tBwd, Sum)); ProgressLog("%10.3g %4.1f%% Post\n", double(g_tPost), GetPct(g_tPost, Sum)); ProgressLog("%10.3g %4.1f%% Aln\n", double(g_tAln), GetPct(g_tAln, Sum)); ProgressLog("%10.3g %4.1f%% Delete\n", double(g_tDelete), GetPct(g_tDelete, Sum)); } #endif muscle-5.1.0/src/mpcflat.h000066400000000000000000000055741424453062600154140ustar00rootroot00000000000000#pragma once #include "multisequence.h" #include "upgma5.h" #include "tree.h" #include "treeperm.h" #include "mysparsemx.h" // Multi-threaded ProbCons class MPCFlat { public: MultiSequence *m_InputSeqs = 0; MultiSequence *m_MSA = 0; uint m_ConsistencyIterCount = DEFAULT_CONSISTENCY_ITERS; uint m_RefineIterCount = DEFAULT_REFINE_ITERS; TREEPERM m_TreePerm = TP_None; vector m_Labels; map m_LabelToIndex; UPGMA5 m_Upgma5; Tree m_GuideTree; vector m_ProgMSAs; vector > m_DistMx; vector > m_Pairs; map, uint> m_PairToIndex; vector m_JoinIndexes1; vector m_JoinIndexes2; // Per-pair vector m_SparsePosts1; vector m_SparsePosts2; vector *m_ptrSparsePosts = &m_SparsePosts1; vector *m_ptrUpdatedSparsePosts = &m_SparsePosts2; public: ~MPCFlat() { Clear(); } void Clear(); void Run(MultiSequence *InputSeqs); uint GetSeqCount() const; void Run_Super4(MultiSequence *InputSeqs); private: void AllocPairCount(uint SeqCount); void FreeProgMSAs(); void FreeSparsePosts(); uint GetL(uint SeqIndex) const; void InitSeqs(MultiSequence *InputSeqs); void InitPairs(); void InitDistMx(); void CalcPosteriors(); void CalcPosterior(uint PairIndex); void Consistency(); void ConsIter(uint 
Iter); void ConsPair(uint PairIndex); void CalcGuideTree(); void CalcGuideTree_RandomChain(); void CalcJoinOrder(); void ProgressiveAlign(); void Refine(); void RefineIter(); void ProgAln(uint JoinIndex); const pair &GetPair(uint PairIndex) const; const char *GetLabel(uint SeqIndex) const; const byte *GetBytePtr(uint SeqIndex) const; uint GetPairIndex(uint SMI1, uint SMI2) const; MySparseMx &GetSparsePost(uint PairIndex); MySparseMx &GetUpdatedSparsePost(uint PairIndex); MultiSequence *AlignAlns(const MultiSequence &MSA1, const MultiSequence &MSA2); void BuildPost(const MultiSequence &MSA1, const MultiSequence &MSA2, float *Post); uint GetSeqLength(uint SeqIndex) const; const Sequence *GetSequence(uint SeqIndex) const; }; float *AllocFB(uint LX, uint LY); float *AllocPost(uint LX, uint LY); float *AllocDPRows(uint LX, uint LY); char *AllocTB(uint LX, uint LY); float CalcTotalProbFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY); void CalcFwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); void CalcBwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); float CalcAlnScoreFlat(const float *Post, uint LX, uint LY, float *DPRows); float CalcAlnFlat(const float *Post, uint LX, uint LY, float *DPRows, char *TB, string &Path); void RelaxFlat_XZ_ZY(const MySparseMx &XZ, const MySparseMx &ZY, float *Post); void RelaxFlat_ZX_ZY(const MySparseMx &XZ, const MySparseMx &YZ, float *Post); void RelaxFlat_XZ_YZ(const MySparseMx &XZ, const MySparseMx &YZ, float *Post); muscle-5.1.0/src/msa.cpp000066400000000000000000000521331424453062600150720ustar00rootroot00000000000000#include "muscle.h" #include "msa.h" #include "textfile.h" #include "seq.h" #include "sequence.h" #include const unsigned DEFAULT_SEQ_LENGTH = 500; unsigned MSA::m_uIdCount = 0; MSA::MSA() { m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; m_uCacheSeqCount = 0; m_uCacheSeqLength = 0; } MSA::~MSA() { Free(); } void 
MSA::Free() { for (unsigned n = 0; n < m_uSeqCount; ++n) { delete[] m_szSeqs[n]; delete[] m_szNames[n]; } delete[] m_szSeqs; delete[] m_szNames; delete[] m_IdToSeqIndex; delete[] m_SeqIndexToId; m_uSeqCount = 0; m_uColCount = 0; m_szSeqs = 0; m_szNames = 0; m_IdToSeqIndex = 0; m_SeqIndexToId = 0; m_uCacheSeqLength = 0; m_uCacheSeqCount = 0; } void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) { Free(); m_uSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = uColCount; if (0 == uSeqCount && 0 == uColCount) return; m_szSeqs = new char *[uSeqCount]; m_szNames = new char *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { m_szSeqs[uSeqIndex] = new char[uColCount+1]; m_szNames[uSeqIndex] = 0; #if DEBUG memset(m_szSeqs[uSeqIndex], '?', uColCount); #endif m_szSeqs[uSeqIndex][uColCount] = 0; } if (m_uIdCount > 0) { m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; #if DEBUG memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); #endif } } void MSA::LogMe() const { if (0 == GetColCount()) { Log("MSA empty\n"); return; } const unsigned uColsPerLine = 50; unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; for (unsigned n = 0; n < uLinesPerSeq; ++n) { unsigned i; unsigned iStart = n*uColsPerLine; unsigned iEnd = GetColCount(); if (iEnd - iStart + 1 > uColsPerLine) iEnd = iStart + uColsPerLine; Log(" "); for (i = iStart; i < iEnd; ++i) Log("%u", i%10); Log("\n"); Log(" "); for (i = iStart; i + 9 < iEnd; i += 10) Log("%-10u", i); if (n == uLinesPerSeq - 1) Log(" %-10u", GetColCount()); Log("\n"); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { Log("%12.12s", m_szNames[uSeqIndex]); Log(" "); Log(" "); for (i = iStart; i < iEnd; ++i) Log("%c", GetChar(uSeqIndex, i)); if (0 != m_SeqIndexToId) Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); Log("\n"); } Log("\n\n"); } } char MSA::GetChar(unsigned uSeqIndex, 
unsigned uIndex) const { // TODO: Performance cost? if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) Die("MSA::GetChar(%u/%u,%u/%u)", uSeqIndex, m_uSeqCount, uIndex, m_uColCount); char c = m_szSeqs[uSeqIndex][uIndex]; // assert(IsLegalChar(c)); return c; } unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetter(c); if (uLetter >= 20) { char c = ' '; if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) c = m_szSeqs[uSeqIndex][uIndex]; Die("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); } return uLetter; } unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const { // TODO: Performance cost? char c = GetChar(uSeqIndex, uIndex); unsigned uLetter = CharToLetterEx(c); return uLetter; } void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) { if (uSeqIndex >= m_uSeqCount) Die("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); delete[] m_szNames[uSeqIndex]; int n = (int) strlen(szName) + 1; m_szNames[uSeqIndex] = new char[n]; memcpy(m_szNames[uSeqIndex], szName, n); } void MSA::GetSeqLabel(uint SeqIndex, string &Label) const { Label = string(GetSeqName(SeqIndex)); } const char *MSA::GetSeqName(unsigned uSeqIndex) const { if (uSeqIndex >= m_uSeqCount) Die("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); return m_szNames[uSeqIndex]; } bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsGapChar(c); } bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const { char c = GetChar(uSeqIndex, uIndex); return IsWildcardChar(c); } void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) { if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) Die("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); if (uIndex == m_uCacheSeqLength) { const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; for (unsigned n 
= 0; n < m_uSeqCount; ++n) { char *ptrNewSeq = new char[uNewCacheSeqLength+1]; memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); ptrNewSeq[uNewCacheSeqLength] = 0; delete[] m_szSeqs[n]; m_szSeqs[n] = ptrNewSeq; } m_uColCount = uIndex; m_uCacheSeqLength = uNewCacheSeqLength; } if (uIndex >= m_uColCount) m_uColCount = uIndex + 1; m_szSeqs[uSeqIndex][uIndex] = c; } const char *MSA::GetSeqCharPtr(uint SeqIndex) const { asserta(SeqIndex < m_uSeqCount); const char *SeqCharPtr = m_szSeqs[SeqIndex]; return SeqCharPtr; } void MSA::GetRowStr(unsigned uSeqIndex, string &RowStr) const { RowStr.clear(); const char *SeqCharPtr = GetSeqCharPtr(uSeqIndex); for (uint i = 0; i < m_uColCount; ++i) RowStr += SeqCharPtr[i]; } void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const { assert(uSeqIndex < m_uSeqCount); seq.Clear(); for (unsigned n = 0; n < m_uColCount; ++n) if (!IsGap(uSeqIndex, n)) { char c = GetChar(uSeqIndex, n); if (!isalpha(c)) Die("Invalid character '%c' in sequence", c); c = toupper(c); seq.push_back(c); } const char *ptrName = GetSeqName(uSeqIndex); seq.SetName(ptrName); } bool MSA::HasGap() const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) for (unsigned n = 0; n < GetColCount(); ++n) if (IsGap(uSeqIndex, n)) return true; return false; } bool MSA::IsLegalLetter(unsigned uLetter) const { return uLetter < 20; } void MSA::SetSeqCount(unsigned uSeqCount) { Free(); SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); } void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) { assert(uFromCol < GetColCount()); assert(uToCol < GetColCount()); if (uFromCol == uToCol) return; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { const char c = GetChar(uSeqIndex, uFromCol); SetChar(uSeqIndex, uToCol, c); } } void MSA::Copy(const MSA &msa) { Free(); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); SetSize(uSeqCount, uColCount); for (unsigned 
uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); const unsigned uId = msa.GetSeqId(uSeqIndex); if (uId != UINT_MAX) SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); SetChar(uSeqIndex, uColIndex, c); } } } uint MSA::GetGapCount(unsigned uColIndex) const { uint n = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) ++n; return n; } bool MSA::IsGapColumn(unsigned uColIndex) const { assert(GetSeqCount() > 0); for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } uint MSA::GetSeqIndex(const string &Label, bool FailOnError) const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (0 == stricmp(Label.c_str(), GetSeqName(uSeqIndex))) return uSeqIndex; if (FailOnError) Die("Not found >%s", Label.c_str()); return UINT_MAX; } bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) { *ptruSeqIndex = uSeqIndex; return true; } return false; } void MSA::DeleteCol(unsigned uColIndex) { assert(uColIndex < m_uColCount); size_t n = m_uColCount - uColIndex; if (n > 0) { for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { char *ptrSeq = m_szSeqs[uSeqIndex]; memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); } } --m_uColCount; } void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) { for (unsigned n = 0; n < uColCount; ++n) DeleteCol(uColIndex); } void MSA::FromFile(TextFile &File) { FromFASTAFile(File); } static void FmtChar(char c, unsigned uWidth) { Log("%c", c); for (unsigned n = 0; n < uWidth - 1; ++n) Log(" "); } static void FmtInt(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < 
sizeof(szStr)); if (u > 0) sprintf(szStr, "%u", u); else strcpy(szStr, "."); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtInt0(unsigned u, unsigned uWidth) { static char szStr[1024]; assert(uWidth < sizeof(szStr)); sprintf(szStr, "%u", u); Log(szStr); unsigned n = (unsigned) strlen(szStr); if (n < uWidth) for (unsigned i = 0; i < uWidth - n; ++i) Log(" "); } static void FmtPad(unsigned n) { for (unsigned i = 0; i < n; ++i) Log(" "); } void MSA::GetLabelToSeqIndex(vector &Labels, map &LabelToSeqIndex) const { Labels.clear(); LabelToSeqIndex.clear(); for (uint SeqIndex = 0; SeqIndex < m_uSeqCount; ++SeqIndex) { const string Label = (string) GetSeqName(SeqIndex); if (LabelToSeqIndex.find(Label) != LabelToSeqIndex.end()) Die("Dupe label >%s", Label.c_str()); Labels.push_back(Label); LabelToSeqIndex[Label] = SeqIndex; } } void MSA::FromSequence(const Sequence &s) { unsigned uSeqLength = s.GetLength(); SetSize(1, uSeqLength); SetSeqName(0, s.GetLabelCStr()); const byte *CharSeq = s.GetBytePtr(); for (unsigned n = 0; n < uSeqLength; ++n) SetChar(0, n, CharSeq[n]); } void MSA::FromSeq(const Seq &s) { unsigned uSeqLength = s.Length(); SetSize(1, uSeqLength); SetSeqName(0, s.GetName()); if (0 != m_SeqIndexToId) SetSeqId(0, s.GetId()); for (unsigned n = 0; n < uSeqLength; ++n) SetChar(0, n, s[n]); } unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const { assert(uSeqIndex < GetSeqCount()); assert(uColIndex < GetColCount()); unsigned uCol = 0; for (unsigned n = 0; n <= uColIndex; ++n) if (!IsGap(uSeqIndex, n)) ++uCol; return uCol; } void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) { assert(uToSeqIndex < m_uSeqCount); const unsigned uColCount = msaFrom.GetColCount(); assert(m_uColCount == uColCount || (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); 
SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); if (0 == m_uColCount) m_uColCount = uColCount; } const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const { assert(uSeqIndex < m_uSeqCount); return m_szSeqs[uSeqIndex]; } void MSA::DeleteSeq(unsigned uSeqIndex) { assert(uSeqIndex < m_uSeqCount); delete m_szSeqs[uSeqIndex]; delete m_szNames[uSeqIndex]; const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *); if (uBytesToMove > 0) { memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove); memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove); } --m_uSeqCount; } bool MSA::IsEmptyCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (!IsGap(uSeqIndex, uColIndex)) return false; return true; } //void MSA::DeleteEmptyCols(bool bProgress) // { // unsigned uColCount = GetColCount(); // for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) // { // if (IsEmptyCol(uColIndex)) // { // if (bProgress) // { // Log("Deleting col %u of %u\n", uColIndex, uColCount); // printf("Deleting col %u of %u\n", uColIndex, uColCount); // } // DeleteCol(uColIndex); // --uColCount; // } // } // } unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const { Die("MSA::AlignedColIndexToColIndex not implemented"); return 0; } bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2) { Seq s1; Seq s2; a1.GetSeq(uSeqIndex1, s1); a2.GetSeq(uSeqIndex2, s2); s1.StripGaps(); s2.StripGaps(); return s1.EqIgnoreCase(s2); } unsigned MSA::GetSeqLength(unsigned uSeqIndex) const { assert(uSeqIndex < GetSeqCount()); const unsigned uColCount = GetColCount(); unsigned uLength = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) if (!IsGap(uSeqIndex, uColIndex)) ++uLength; return uLength; } void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, unsigned *ptruPosCount) const { 
assert(uSeqIndex1 < GetSeqCount()); assert(uSeqIndex2 < GetSeqCount()); unsigned uSameCount = 0; unsigned uPosCount = 0; const unsigned uColCount = GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { char c1 = GetChar(uSeqIndex1, uColIndex); if (IsGapChar(c1)) continue; char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c2)) continue; ++uPosCount; if (c1 == c2) ++uSameCount; } *ptruPosCount = uPosCount; if (uPosCount > 0) *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; else *ptrPWID = 0; } unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const { assert(uColIndex < GetColCount()); unsigned Counts[MAX_ALPHA]; memset(Counts, 0, sizeof(Counts)); const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) continue; const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); ++(Counts[uLetter]); } unsigned uUniqueCount = 0; for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) if (Counts[uLetter] > 0) ++uUniqueCount; return uUniqueCount; } double MSA::GetOcc(unsigned uColIndex) const { unsigned uGapCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) ++uGapCount; unsigned uSeqCount = GetSeqCount(); return (double) (uSeqCount - uGapCount) / (double) uSeqCount; } void MSA::ToFile(TextFile &File) const { ToFASTAFile(File); } void MSA::GetUngappedSeqStr(uint SeqIndex, string &SeqStr) const { SeqStr.clear(); asserta(SeqIndex < m_uSeqCount); for (uint i = 0; i < m_uColCount; ++i) { char c = m_szSeqs[SeqIndex][i]; if (!isgap(c)) SeqStr += c; } } bool MSA::ColumnHasGap(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) return true; return false; } void MSA::SetIdCount(unsigned uIdCount) { //if (m_uIdCount != 0) // 
Die("MSA::SetIdCount: may only be called once"); if (m_uIdCount > 0) { if (uIdCount > m_uIdCount) Die("MSA::SetIdCount: cannot increase count"); return; } m_uIdCount = uIdCount; } void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) { assert(uSeqIndex < m_uSeqCount); assert(uId == UINT_MAX || uId < m_uIdCount); if (0 == m_SeqIndexToId) { if (0 == m_uIdCount) Die("MSA::SetSeqId, SetIdCount has not been called"); m_IdToSeqIndex = new unsigned[m_uIdCount]; m_SeqIndexToId = new unsigned[m_uSeqCount]; memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); } m_SeqIndexToId[uSeqIndex] = uId; if (uId != UINT_MAX) m_IdToSeqIndex[uId] = uSeqIndex; } unsigned MSA::GetSeqIndex(unsigned uId) const { assert(uId < m_uIdCount); assert(0 != m_IdToSeqIndex); unsigned uSeqIndex = m_IdToSeqIndex[uId]; assert(uSeqIndex < m_uSeqCount); return uSeqIndex; } bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const { for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { if (uId == m_SeqIndexToId[uSeqIndex]) { *ptruIndex = uSeqIndex; return true; } } return false; } unsigned MSA::GetSeqId(unsigned uSeqIndex) const { if (m_SeqIndexToId == 0) return UINT_MAX; assert(uSeqIndex < m_uSeqCount); unsigned uId = m_SeqIndexToId[uSeqIndex]; assert(uId == UINT_MAX || uId < m_uIdCount); return uId; } void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uIdCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) { const unsigned uId = Ids[uSeqIndexOut]; const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); msaOut.SetSeqId(uSeqIndexOut, uId); msaOut.SetSeqName(uSeqIndexOut, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, 
uColIndex, c); } } } // Caller must allocate ptrSeq and ptrLabel as new char[n]. void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) { if (m_uSeqCount > m_uCacheSeqCount) Die("Internal error MSA::AppendSeq"); if (m_uSeqCount == m_uCacheSeqCount) ExpandCache(m_uSeqCount + 4, uSeqLength); m_szSeqs[m_uSeqCount] = ptrSeq; m_szNames[m_uSeqCount] = ptrLabel; ++m_uSeqCount; } void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) { if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) Die("Internal error MSA::ExpandCache"); if (m_uSeqCount > 0 && uColCount != m_uColCount) Die("Internal error MSA::ExpandCache, ColCount changed"); char **NewSeqs = new char *[uSeqCount]; char **NewNames = new char *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; NewNames[uSeqIndex] = m_szNames[uSeqIndex]; } for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) { char *Seq = new char[uColCount]; NewSeqs[uSeqIndex] = Seq; #if DEBUG memset(Seq, '?', uColCount); #endif } delete[] m_szSeqs; delete[] m_szNames; m_szSeqs = NewSeqs; m_szNames = NewNames; m_uCacheSeqCount = uSeqCount; m_uCacheSeqLength = uColCount; m_uColCount = uColCount; } void MSA::FixAlpha() { ClearInvalidLetterWarning(); for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) { char c = GetChar(uSeqIndex, uColIndex); if (!IsResidueChar(c) && !IsGapChar(c)) { char w = GetWildcardChar(); // Warning("Invalid letter '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); SetChar(uSeqIndex, uColIndex, w); } } } ReportInvalidLetters(); } ALPHA MSA::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. 
const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; unsigned i = 0; for (;;) { unsigned uSeqIndex = i/uColCount; if (uSeqIndex >= uSeqCount) break; unsigned uColIndex = i%uColCount; ++i; char c = GetChar(uSeqIndex, uColIndex); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_Nucleo; if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_Nucleo; return ALPHA_Amino; } void MSA::GetPosToCol(uint SeqIndex, vector &PosToCol) const { PosToCol.clear(); const uint ColCount = GetColCount(); const char *Seq = GetSeqCharPtr(SeqIndex); PosToCol.reserve(ColCount); for (uint Col = 0; Col < ColCount; ++Col) { char c = Seq[Col]; if (!isgap(c)) PosToCol.push_back(Col); } } void MSA::GetColToPos(uint SeqIndex, vector &ColToPos) const { ColToPos.clear(); const uint ColCount = GetColCount(); const char *Seq = GetSeqCharPtr(SeqIndex); ColToPos.reserve(ColCount); uint Pos = 0; for (uint Col = 0; Col < ColCount; ++Col) { char c = Seq[Col]; if (isgap(c)) ColToPos.push_back(UINT_MAX); else ColToPos.push_back(Pos++); } } bool MSA::ColIsUpper(uint ColIndex, double MaxGapFract) const { const uint SeqCount = GetSeqCount(); uint UpperCount = 0; uint LowerCount = 0; uint GapCount = 0; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char c = GetChar(SeqIndex, ColIndex); if (isgap(c)) { ++GapCount; continue; } if (!isalpha(c)) continue; if (isupper(c)) ++UpperCount; else ++LowerCount; } if (UpperCount == 0 && LowerCount == 0) return false; if (UpperCount > 0 && LowerCount > 0) Die("Column %u has mixed case letters", ColIndex); if (double(GapCount)/SeqCount > MaxGapFract) return false; if (UpperCount == 0) 
return false; return true; } muscle-5.1.0/src/msa.h000066400000000000000000000117551424453062600145440ustar00rootroot00000000000000#ifndef MSA_h #define MSA_h struct PathEdge; class TextFile; class Seq; class ClusterNode; class NodeCounts; class DataBuffer; class Sequence; class MSA { public: unsigned m_uSeqCount; unsigned m_uColCount; unsigned m_uCacheSeqLength; unsigned m_uCacheSeqCount; char **m_szSeqs; char **m_szNames; static unsigned m_uIdCount; unsigned *m_IdToSeqIndex; unsigned *m_SeqIndexToId; public: MSA(); virtual ~MSA(); public: // Ways to create an MSA void FromStrings(const vector &Strings); void FromStrings2(const vector &Labels, vector &Seqs); void FromFile(TextFile &File); void FromFASTAFile(const string &FileName); void FromFASTAFile_PreserveCase(const string &FileName); void FromFASTAFile(TextFile &File); void FromSeq(const Seq &s); void FromSequence(const Sequence &s); void GetLabelToSeqIndex(vector &Labels, map &LabelToIndex) const; void ToFile(TextFile &File) const; void ToFASTAFile(TextFile &File) const; void ToFASTAFile(FILE *f) const; void ToFASTAFile(const string &FileName) const; void SetSize(unsigned uSeqCount, unsigned uColCount); void SetSeqCount(unsigned uSeqCount); char GetChar(unsigned uSeqIndex, unsigned uIndex) const; unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const; unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const; const char *GetSeqName(unsigned uSeqIndex) const; void GetSeqLabel(uint SeqIndex, string &Label) const; unsigned GetSeqId(unsigned uSeqIndex) const; unsigned GetSeqIndex(unsigned uId) const; bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const; double GetOcc(unsigned uColIndex) const; bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const; bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const; bool IsGapColumn(unsigned uColIndex) const; uint GetGapCount(uint ColIndex) const; bool ColumnHasGap(unsigned uColIndex) const; bool IsGapSeq(unsigned uSeqIndex) const; void 
GetUngappedSeqStr(uint SeqIndex, string &SeqStr) const; void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c); void SetSeqName(unsigned uSeqIndex, const char szName[]); void SetSeqId(unsigned uSeqIndex, unsigned uId); bool HasGap() const; bool IsLegalLetter(unsigned uLetter) const; void GetSeq(unsigned uSeqIndex, Seq &seq) const; void GetRowStr(unsigned uSeqIndex, string &SeqStr) const; const char *GetSeqCharPtr(uint SeqIndex) const; void Copy(const MSA &msa); double GetCons(unsigned uColIndex) const; double GetAvgCons() const; double GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; double GetPctIdentityPair2(unsigned uSeqIndex1, unsigned uSeqIndex2) const; bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const; uint GetSeqIndex(const string &Label, bool FailOnError = true) const; void DeleteCol(unsigned uColIndex); void DeleteColumns(unsigned uColIndex, unsigned uColCount); void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex); void DeleteSeq(unsigned uSeqIndex); bool IsEmptyCol(unsigned uColIndex) const; ALPHA GuessAlpha() const; void FixAlpha(); unsigned UniqueResidueTypes(unsigned uColIndex) const; unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const; const char *GetSeqBuffer(unsigned uSeqIndex) const; unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const; unsigned GetSeqLength(unsigned uSeqIndex) const; void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID, unsigned *ptruPosCount) const; void LogMe() const; double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; void Clear() { Free(); } unsigned GetSeqCount() const { return m_uSeqCount; } unsigned GetColCount() const { return m_uColCount; } static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, unsigned uSeqIndex2); void GetPosToCol(uint SeqIndex, vector &PosToCol) const; void GetColToPos(uint SeqIndex, vector &ColToPos) const; bool ColIsUpper(uint ColIndex, 
double MaxGapFract) const; static void SetIdCount(unsigned uIdCount); private: void Free(); void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel); void ExpandCache(unsigned uSeqCount, unsigned uColCount); void GetNameFromFASTAAnnotationLine(const char szLine[], char szName[], unsigned uBytes); void CopyCol(unsigned uFromCol, unsigned uToCol); }; void DeleteGappedCols(MSA &msa); void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut); void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat); void MSAAppend(MSA &msa1, const MSA &msa2); void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut); void AssertMSAEq(const MSA &msa1, const MSA &msa2); void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2); void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, MSA &msaOut); void SetMSAWeightsMuscle(MSA &msa); void SetClustalWWeightsMuscle(MSA &msa); void SetThreeWayWeightsMuscle(MSA &msa); #endif // MSA_h muscle-5.1.0/src/msa2.cpp000066400000000000000000000136541424453062600151610ustar00rootroot00000000000000#include "muscle.h" #include "msa.h" #include "tree.h" #include "seq.h" // These global variables are a hack to allow the tree // dependent iteration code to communicate the edge // used to divide the tree. The three-way weighting // scheme needs to know this edge in order to compute // sequence weights. 
static const Tree *g_ptrMuscleTree = 0; unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR; unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR; void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInColCount = msaIn.GetColCount(); if (uFromColIndex + uColCount - 1 > uInColCount) Die("MSAFromColRange, out of bounds"); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } } void DeleteGappedCols(MSA &msa) { unsigned uColIndex = 0; for (;;) { if (uColIndex >= msa.GetColCount()) break; if (msa.IsGapColumn(uColIndex)) msa.DeleteCol(uColIndex); else ++uColIndex; } } void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) { unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); unsigned uId = 
msaIn.GetSeqId(uSeqIndexIn); msaOut.SetSeqName(uSeqIndexOut, ptrName); msaOut.SetSeqId(uSeqIndexOut, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } } void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Die("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.EqIgnoreCaseAndGaps(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Die("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } void AssertMSAEq(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Die("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.Eq(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Die("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } } static unsigned g_uMuscleIdCount; #define LOCAL_VERBOSE 0 // Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; 
uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } } // "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } } muscle-5.1.0/src/msastats.cpp000066400000000000000000000025671424453062600161570ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "msa.h" #include "textfile.h" #include "quarts.h" void cmd_msastats() { const string &MSAFileName = opt(msastats); MSA Aln; TextFile f(MSAFileName.c_str()); Aln.FromFASTAFile(f); f.Close(); const uint SeqCount = Aln.GetSeqCount(); const uint ColCount = Aln.GetColCount(); ProgressPrefix(false); ProgressLog("%10u Sequences\n", SeqCount); ProgressLog("%10u Columns\n", ColCount); vector Ls; for (uint SeqIndex = 0; 
SeqIndex < SeqCount; ++SeqIndex) { uint L = Aln.GetSeqLength(SeqIndex); Ls.push_back(L); } Quarts Q; GetQuarts(Ls, Q); ProgressLog("%10.1f Mean seq length", Q.Avg); ProgressLog(" min %u, median %u, max %u\n", Q.Min, Q.Med, Q.Max); vector GapPcts; uint Gap0 = 0; uint Gap50 = 0; for (uint Col = 0; Col < ColCount; ++Col) { uint GapCount = Aln.GetGapCount(Col); uint GapPct = (100*GapCount)/SeqCount; if (GapCount == 0) ++Gap0; if (GapPct < 50) ++Gap50; GapPcts.push_back(GapPct); } GetQuarts(GapPcts, Q); ProgressLog("%10.1f Mean col gap pct,", Q.Avg); ProgressLog(" min %u, median %u, max %u\n", Q.Min, Q.Med, Q.Max); ProgressLog("%10u Cols with no gaps (%.1f%% of cols)\n", Gap0, GetPct(Gap0, ColCount)); ProgressLog("%10u Cols with <50%% gaps (%.1f%% of cols)\n", Gap50, GetPct(Gap50, ColCount)); ProgressLog("\n"); ProgressPrefix(true); } muscle-5.1.0/src/msbuild.bash000066400000000000000000000002501424453062600160750ustar00rootroot00000000000000#!/bin/bash -e 'c:/Program Files/Microsoft Visual Studio/2022/Community/Msbuild/Current/Bin/MSBuild.exe' muscle.sln /t:muscle /p:Platform=x64 /p:Configuration=Release muscle-5.1.0/src/multisequence.cpp000066400000000000000000000151431424453062600171750ustar00rootroot00000000000000#include "muscle.h" #include "alpha3.h" #include "sort.h" void MultiSequence::LoadMFA(const string& filename, bool stripGaps) { // try opening file FileBuffer infile(filename.c_str()); if (infile.fail()) Die("Cannot open %s, errno=%d %s", filename.c_str(), errno, strerror(errno)); // if successful, then load using other LoadMFA() routine LoadMFA(infile, stripGaps); infile.close(); } #if SEQ_TRACE void MultiSequence::AssertSeqIds() const { const uint SeqCount = GetSeqCount(); asserta(SIZE(m_Owners) == SIZE(m_Seqs)); for (uint i = 0; i < SeqCount; ++i) m_Seqs[i]->AssertId(); } #endif void MultiSequence::Clear() { const uint SeqCount = GetSeqCount(); asserta(SIZE(m_Owners) == SIZE(m_Seqs)); for (uint i = 0; i < SeqCount; ++i) { if (m_Owners[i]) 
DeleteSequence(m_Seqs[i]); } m_Seqs.clear(); m_Owners.clear(); } void MultiSequence::Copy(const MultiSequence &rhs) { Clear(); const uint SeqCount = rhs.GetSeqCount(); for (uint i = 0; i < SeqCount; ++i) { const Sequence *r = rhs.GetSequence(i)->Clone(); AddSequence(r, true); } } void MultiSequence::LogMe() const { Log("\n"); Log("MultiSequence::LogMe(%p), %u seqs\n", this, GetSeqCount()); for (uint i = 0; i < GetSeqCount(); ++i) m_Seqs[i]->LogMe(); } uint MultiSequence::GetGSI(uint SeqIndex) const { const Sequence *Seq = GetSequence(SeqIndex); uint L = (uint) Seq->GetGSI(); return L; } uint MultiSequence::GetSeqLength(uint SeqIndex) const { const Sequence *Seq = GetSequence(SeqIndex); uint L = (uint) Seq->GetLength(); return L; } void MultiSequence::GetLengthOrder(vector &SeqIndexes) const { const uint SeqCount = GetSeqCount(); vector Ls; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { uint L = (uint) GetSequence(SeqIndex)->GetLength(); Ls.push_back(L); } SeqIndexes.resize(SeqCount); const uint *PtrLs = Ls.data(); uint *PtrOrder = SeqIndexes.data(); QuickSortOrderDesc(PtrLs, SeqCount, PtrOrder); } void MultiSequence::AssertGSIs() const { const uint GlobalSeqCount = GetGlobalMSSeqCount(); const uint SeqCount = GetSeqCount(); for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq = GetSequence(i); asserta(Seq != 0); uint GSI = Seq->GetGSI(); asserta(GSI < GlobalSeqCount); } } void MultiSequence::LogGSIs(const char *Msg) const { const MultiSequence &GlobalMS = GetGlobalInputMS(); const uint GlobalSeqCount = GlobalMS.GetSeqCount(); Log("\n"); Log("LogGSIs()"); if (Msg != 0) Log(" %s ", Msg); const uint SeqCount = GetSeqCount(); Log("%u seqs\n", SeqCount); for (uint i = 0; i < SeqCount; ++i) { Log("[%5u]", i); const Sequence *Seq = GetSequence(i); uint GSI = (uint) Seq->GetGSI(); string Label = Seq->GetLabel(); asserta(GSI < GlobalSeqCount); const Sequence *GlobalSeq = GlobalMS.GetSequence(GSI); string GlobalLabel = GlobalSeq->GetLabel(); Log(" %5d", 
GSI); Log(" >%s", Label.c_str()); Log(" (%s)", GlobalLabel.c_str()); if (Label != GlobalLabel) Log(" <<< ERROR"); Log("\n"); } } void MultiSequence::LoadMFA(FileBuffer& infile, bool stripGaps) { if (infile.fail()) Die("LoadMFA read error"); set Labels; unsigned DupeCount = 0; for (;;) { Sequence *seq = Sequence::NewSequence(); bool Ok = seq->FromFileBuffer(infile, stripGaps); if (!Ok) { DeleteSequence(seq); break; } string Label = seq->m_Label; bool Dupe = false; for (uint i = 1; i < 100; ++i) { if (Labels.find(Label) == Labels.end()) break; Dupe = true; Ps(Label, "%s dupelabel%u", seq->m_Label.c_str(), i); } if (Dupe) { Log("Duplicate label >%s", seq->m_Label.c_str()); seq->m_Label = Label; ++DupeCount; } Labels.insert(Label); m_Seqs.push_back(seq); m_Owners.push_back(true); } if (DupeCount > 0) Warning("%u duplicate labels", DupeCount); } uint MultiSequence::GetColCount() const { assert(IsAligned()); const Sequence *Seq = GetSequence(0); uint L = Seq->GetLength(); return L; } bool MultiSequence::IsAligned() const { int N = GetNumSequences(); if (N == 0) return false; uint ColCount0 = GetSequence(0)->GetLength(); for (int i = 1; i < N; ++i) { int ColCount = GetSequence(i)->GetLength(); if (ColCount != ColCount0) return false; } return true; } uint MultiSequence::GetSeqIndex(const string &Label, bool FailOnError) const { int N = GetNumSequences(); for (int i = 0; i < N; ++i) { const Sequence *Seq = GetSequence(i); if (Seq->GetLabel() == Label) return i; } if (FailOnError) Die("Label not found >%s", Label.c_str()); return UINT_MAX; } bool MultiSequence::GuessIsNucleo() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. 
const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned SeqCount = GetSeqCount(); uint NucleoCount = 0; for (uint i = 0; i < 100; ++i) { uint SeqIndex = randu32()%SeqCount; const Sequence &seq = *GetSequence(SeqIndex); const uint L = seq.GetLength(); uint r = randu32(); const uint Pos = r%L; byte c = (byte) GetChar(SeqIndex, Pos); uint Letter = g_CharToLetterNucleo[c]; if (Letter < 4) ++NucleoCount; } if (NucleoCount > 75) return true; return false; } void MultiSequence::FromStrings(const vector &Labels, const vector &Seqs) { Clear(); const uint N = SIZE(Seqs); asserta(SIZE(Labels) == N); for (uint i = 0; i < N; ++i) { const string &Label = Labels[i]; const string &Str = Seqs[i]; Sequence *Seq = Sequence::NewSequence(); Seq->FromString(Label, Str); AddSequence(Seq, true); } } void MultiSequence::ToMSA(MSA &msa) const { uint SeqCount = GetSeqCount(); uint ColCount = GetColCount(); msa.SetSize(SeqCount, ColCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const Sequence *Seq = GetSequence(SeqIndex); const byte *ByteSeq = Seq->GetBytePtr(); const char *Label = Seq->GetLabel().c_str(); uint L = Seq->GetLength(); char *CharSeq = myalloc(char, L+1); memcpy(CharSeq, ByteSeq, L); CharSeq[L] = 0; msa.m_szSeqs[SeqIndex] = CharSeq; msa.m_szNames[SeqIndex] = mystrsave(Label); } } double MultiSequence::GetMeanSeqLength() const { const uint SeqCount = GetSeqCount(); if (SeqCount == 0) return 0; double SumSeqLength = 0; for (uint i = 0; i < SeqCount; ++i) SumSeqLength += GetSequence(i)->GetLength(); return SumSeqLength/SeqCount; } uint MultiSequence::GetMaxSeqLength() const { const uint SeqCount = GetSeqCount(); uint MaxSeqLength = 0; for (uint i = 0; i < SeqCount; ++i) MaxSeqLength = max(MaxSeqLength, GetSequence(i)->GetLength()); return MaxSeqLength; } muscle-5.1.0/src/multisequence.h000066400000000000000000000054011424453062600166360ustar00rootroot00000000000000#pragma once #include #include "sequence.h" class MSA; class 
MultiSequence
{
public:
	// The sequences of this collection and, in parallel, whether Clear()
	// should delete each one. Both vectors always have the same size.
	// NOTE(review): no copy constructor or copy assignment is declared, so a
	// compiler-generated copy would share the owned pointers and both
	// destructors would delete them. Use Copy() to duplicate a collection.
	vector<const Sequence *> m_Seqs;
	vector<bool> m_Owners;

public:
	MultiSequence() { Clear(); }
	MultiSequence(FileBuffer& infile) { LoadMFA(infile); }
	MultiSequence(const string& filename) { LoadMFA(filename); }

	// Empty the collection, deleting the sequences it owns.
	void Clear();
	// void DeleteSeqs();
	~MultiSequence() { Clear(); }

	void FromStrings(const vector<string> &Labels, const vector<string> &Seqs);
	void Copy(const MultiSequence &rhs);
	void LoadMFA(const string& filename, bool stripGaps = false);
	void LoadMFA(FileBuffer& infile, bool stripGaps = false);

	// Same as LoadMFA(filename, stripGaps).
	void FromFASTA(const string& filename, bool stripGaps = false)
	{
		LoadMFA(filename, stripGaps);
	}

	uint GetSeqIndex(const string &Label, bool FailOnError = true) const;
	void ToMSA(MSA &msa) const;

	// Append a sequence; Owner tells whether Clear() deletes it.
	void AddSequence(const Sequence *sequence, bool Owner)
	{
		m_Seqs.push_back(sequence);
		m_Owners.push_back(Owner);
	}

	// Write every sequence to f with Sequence::WriteMFA; no-op if f is null.
	void WriteMFA(FILE *f) const
	{
		if (f == 0)
			return;
		for (uint i = 0; i < SIZE(m_Seqs); ++i)
		{
			const Sequence *Seq = m_Seqs[i];
			Seq->WriteMFA(f);
		}
	}

	// Write every sequence to a new file; no-op if FileName is empty.
	void WriteMFA(const string &FileName) const
	{
		if (FileName.empty())
			return;
		FILE *f = CreateStdioFile(FileName);
		WriteMFA(f);
		CloseStdioFile(f);
	}

	const Sequence *GetSequence(int i) const { return m_Seqs[i]; }
	int GetNumSequences() const { return (int) SIZE(m_Seqs); }
	uint GetSeqCount() const { return SIZE(m_Seqs); }

	double GetMeanSeqLength() const;
	uint GetMaxSeqLength() const;

	// Character at zero-based position ZeroBasedPos of sequence SeqIndex
	// (Sequence::GetPosition is called with the position plus one).
	uint GetChar(uint SeqIndex, uint ZeroBasedPos) const
	{
		const Sequence *seq = GetSequence((int) SeqIndex);
		char c = seq->GetPosition(int(ZeroBasedPos)+1);
		return c;
	}

	// NOTE(review): the element type of `set` was lost from this text and
	// cannot be determined from the code visible here; restore it from the
	// definition of Project().
	MultiSequence* Project(const set& indices);

	bool IsAligned() const;
	uint GetColCount() const;

	const string &GetLabelStr(uint SeqIndex) const
	{
		const Sequence *seq = GetSequence((int) SeqIndex);
		return seq->m_Label;
	}

	const char *GetLabel(uint SeqIndex) const
	{
		const Sequence *seq = GetSequence((int) SeqIndex);
		return seq->m_Label.c_str();
	}

	// Pointer to the first residue of sequence SeqIndex (one element past
	// the start of m_CharVec); the sequence length is returned in L.
	const byte *GetByteSeq(uint SeqIndex, uint &L) const
	{
		const Sequence *seq = GetSequence((int) SeqIndex);
		const byte *ByteSeq = (const byte *)
seq->m_CharVec.data() + 1;
		L = (uint) seq->GetLength();
		return ByteSeq;
	}

	// Result of Sequence::GetBytePtr() for sequence SeqIndex.
	const byte *GetBytePtr(uint SeqIndex) const
	{
		const Sequence *seq = GetSequence((int) SeqIndex);
		const byte *BytePtr = seq->GetBytePtr();
		return BytePtr;
	}

	bool GuessIsNucleo() const;
	void LogGSIs(const char *Msg = 0) const;
	void AssertGSIs() const;
	// NOTE(review): the template argument of `vector` is missing from this
	// text; the definition stores uint values in SeqIndexes.
	void GetLengthOrder(vector &SeqIndexes) const;
	uint GetSeqLength(uint SeqIndex) const;
	uint GetGSI(uint SeqIndex) const;
	void LogMe() const;
#if SEQ_TRACE
	void AssertSeqIds() const;
#endif
};
muscle-5.1.0/src/muscle.h000066400000000000000000000123201424453062600152410ustar00rootroot00000000000000#pragma once

// Keep DEBUG and _DEBUG in step: defining either one defines the other.
#if DEBUG && !_DEBUG
#define _DEBUG 1
#endif

#if _DEBUG && !DEBUG
#define DEBUG 1
#endif

#if _MSC_VER
#define TIMING 0
#endif

#ifdef _MSC_VER // Microsoft compiler
#pragma warning(disable : 4800) // int-bool conversion
#pragma warning(disable : 4996) // deprecated names like strdup, isatty.
// Break into the debugger when x is true.
#define brk(x) if (x) __debugbreak()
#endif

#include "myutils.h"
#include "types.h"
#include "multisequence.h"
#include "textfile.h"
#include "mysparsemx.h"
#include "scoretype.h"
#include "treeperm.h"
#include "pairhmm.h"
#include "alpha.h"
#include "msa.h"
#include "mpcflat.h"
#include "kmerscan.h"
#include "alpha3.h"

// Map the Windows-style names used in the sources onto their POSIX
// counterparts when not building for Windows.
#ifndef _WIN32
#define stricmp strcasecmp
#define strnicmp strncasecmp
#define _snprintf snprintf
#define _fsopen(name, mode, share) fopen((name), (mode))
#endif

const double VERY_NEGATIVE_DOUBLE = -9e29;
const float VERY_NEGATIVE_FLOAT = (float) -9e29;

const double BLOSUM_DIST = 0.62; // todo settable

// NOTE(review): the template arguments of `vector` in the prototypes below
// are missing from this text and are left exactly as found.
void RunSuper5();
void CalcEADistMx(FILE *f, MultiSequence* sequences, vector > &DistMx, vector *SparsePostVec = 0);
void PermuteTree(const Tree &InputTree, Tree &TreeABC, Tree &TreeACB, Tree &TreeBCA, vector &LabelsA, vector &LabelsB, vector &LabelsC);
void PermTree(Tree &InputTree, TREEPERM TP);
void StringsToFile(const string &FileName, const vector &v);
void MakeReplicateFileName_N(const string &Pattern, uint N, string &FileName);
void
MakeReplicateFileName(const string &Pattern, TREEPERM TP, uint PerturbSeed, string &FileName); MultiSequence &LoadGlobalInputMS(const string &FileName); MultiSequence &GetGlobalInputMS(); void ShowGlobalInputSeqStats(); double GetGlobalMSMeanSeqLength(); uint GetGlobalMSSeqCount(); uint GetGSICount(); uint GetAssertSameSeqsOkCount(); const Sequence &GetGlobalInputSeq(uint GSI); const string &GetGlobalInputSeqLabel(uint GSI); void ClearGlobalInputMS(); void CharVecToStr(const vector &Vec, string &Str); void LogAln(const Sequence &X, const Sequence &Y, const string &PathXY); void LogAln(const string &X, const string &Y, const string &PathXY); void ReadStringsFromFile(const string &FileName, vector &Strings); void GetGuideTreeJoinOrder(const Tree &GuideTree, const map &LabelToIndex, vector &Indexes1, vector &Indexes2); void ValidateJoinOrder(const vector &Indexes1, const vector &Indexes2); void _AssertSameLabels(const char *File, uint Line, const MultiSequence &MS); void _AssertSameSeqs(const char *File, uint Line, const MultiSequence &MS1, const MultiSequence &MS2); void _AssertSameSeqsVec(const char *File, uint Line, const MultiSequence &MS, vector &v); void _AssertSameSeqsJoin(const char *File, uint Line, const MultiSequence &MS1, const MultiSequence &MS2, const MultiSequence &MS12); void _AssertSeqsEqInput(const char *File, uint Line, const MultiSequence &MS); void _AssertSeqsEq(const char *FileName, uint LineNr, const MultiSequence &MSA1, const MultiSequence &MSA2); #define AssertSeqsEq(MSA1, MSA2) _AssertSeqsEq(__FILE__, __LINE__, MSA1, MSA2) #define AssertSeqsEqInput(MS) _AssertSeqsEqInput(__FILE__, __LINE__, MS) #define AssertSameLabels(MS) _AssertSameLabels(__FILE__, __LINE__, MS) #define AssertSameSeqs(MS1, MS2) _AssertSameSeqs(__FILE__, __LINE__, MS1, MS2) #define AssertSameSeqsVec(MS, v) _AssertSameSeqsVec(__FILE__, __LINE__, MS, v) #define AssertSameSeqsJoin(MS1, MS2, MS12) _AssertSameSeqsJoin(__FILE__, __LINE__, MS1, MS2, MS12) void LogFlatMx(const 
string &Name, const float *Flat, uint LX, uint LY); void LogFlatMxs(const string &Name, const float *Flat, uint LX, uint LY); void LogFlatMx1(const string &Name, const float *Flat, uint LX, uint LY); float AlignMSAsFlat(const string &aProgressStr, const MultiSequence &MSA1, const MultiSequence &MSA2, uint TargetPairCount, string &Path); void InitProbcons(); void AlignMSAsByPath(const MultiSequence &MSA1, const MultiSequence &MSA2, const string &Path, MultiSequence &MSA12); void CalcFwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); void CalcBwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); void CalcPostFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY, float *Post); float CalcAlnFlat(const float *Post, uint LX, uint LY, float *DPRows, char *TB, string &Path); void CalcPosteriorFlat3(const MultiSequence &MSA1, const MultiSequence &MSA2, const vector &SeqIndexes1, const vector &SeqIndexes2, const vector &SparseMxs, float *Flat); float AlignPairFlat(const Sequence *Seq1, const Sequence *Seq2, string &Path); float AlignPairFlat_SparsePost(const Sequence *Seq1, const Sequence *Seq2, string &Path, MySparseMx *SparsePost); void GetAllPairs(uint SeqCount, vector &SeqIndexes1, vector &SeqIndexes2); void GetAllPairs(uint SeqCount1, uint SeqCount2, vector &SeqIndexes1, vector &SeqIndexes2); void GetPairs(uint SeqCount1, uint SeqCount2, uint TargetPairCount, vector &SeqIndexes1, vector &SeqIndexes2); float GetPostPairsAlignedFlat(const string &aProgressStr, const MultiSequence &MSA1, const MultiSequence &MSA2, const vector &SeqIndexes1, const vector &SeqIndexes2, vector &SparsePosts); muscle-5.1.0/src/muscle.sln000066400000000000000000000020251424453062600156070ustar00rootroot00000000000000 Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.31005.135 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "muscle", 
"muscle.vcxproj", "{57FBAB99-B961-4B38-B846-CA26D63384F2}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {57FBAB99-B961-4B38-B846-CA26D63384F2}.Debug|x64.ActiveCfg = Debug|x64 {57FBAB99-B961-4B38-B846-CA26D63384F2}.Debug|x64.Build.0 = Debug|x64 {57FBAB99-B961-4B38-B846-CA26D63384F2}.Release|x64.ActiveCfg = Release|x64 {57FBAB99-B961-4B38-B846-CA26D63384F2}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {5D3D4A98-605E-4EEF-9A45-7E8D9E9A55B9} EndGlobalSection EndGlobal muscle-5.1.0/src/muscle.vcxproj000066400000000000000000000444771424453062600165270ustar00rootroot00000000000000 Debug x64 Release x64 {57FBAB99-B961-4B38-B846-CA26D63384F2} muscle Win32Proj 10.0 muscle Application MultiByte v142 Application MultiByte v143 Application MultiByte v142 Application MultiByte v143 <_ProjectFileVersion>10.0.40219.1 Debug\ Debug\ true true Release\ Release\ false false SVN version Disabled WIN32;_DEBUG;_CONSOLE;_WIN32_WINNT=0x0400;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebug true Level3 EditAndContinue true Sync psapi.lib;user32.lib;%(AdditionalDependencies) $(OutDir)muscle.exe true $(OutDir)muscle.pdb Console false MachineX86 gitver gitver.bat Disabled WIN32;_DEBUG;_CONSOLE;_WIN32_WINNT=0x0400;%(PreprocessorDefinitions) EnableFastChecks MultiThreadedDebug NotUsing true Level3 ProgramDatabase true Sync true psapi.lib;user32.lib;%(AdditionalDependencies) $(OutDir)muscle.exe true $(OutDir)muscle.pdb Console false SVN version MaxSpeed OnlyExplicitInline true WIN32;NDEBUG;_CONSOLE;_WIN32_WINNT=0x0400;%(PreprocessorDefinitions) true MultiThreaded true true true Level3 ProgramDatabase true Sync psapi.lib;user32.lib;%(AdditionalDependencies) 
$(OutDir)muscle.exe true Console true true false MachineX86 gitver gitver.bat MaxSpeed OnlyExplicitInline true WIN32;NDEBUG;_CONSOLE;_WIN32_WINNT=0x0400;%(PreprocessorDefinitions) true MultiThreaded true true NotUsing true Level3 ProgramDatabase true Sync true Fast true Speed true psapi.lib;user32.lib;%(AdditionalDependencies) $(OutDir)muscle.exe true Console true true false UseLinkTimeCodeGeneration muscle-5.1.0/src/muscle.vcxproj.filters000066400000000000000000000427511424453062600201670ustar00rootroot00000000000000 {4488825b-09a5-4797-b906-ce5eaed73f04} cpp;c;cxx;def;odl;idl;hpj;bat;asm {0a7839f0-d95d-440e-a56c-d06c0cac06d6} h;hpp;hxx;hm;inl;inc Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files 
Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Source Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files Header Files muscle-5.1.0/src/myopts.h000066400000000000000000000030651424453062600153120ustar00rootroot00000000000000#ifndef MY_VERSION #define MY_VERSION "5.2" #endif #define PROGRAM_NAME "muscle" //////////////////// // Commands #define C(x) STR_OPT(x) #include "cmds.h" //////////////////// STR_OPT(log) STR_OPT(output) STR_OPT(output1) STR_OPT(output2) STR_OPT(input2) STR_OPT(joinprefix) STR_OPT(joinpaths) STR_OPT(linkage) STR_OPT(joins) STR_OPT(tsvout) STR_OPT(html) STR_OPT(jalview) STR_OPT(guidetreein) STR_OPT(guidetreeout) STR_OPT(prefix) STR_OPT(suffix) STR_OPT(nodes) STR_OPT(label) STR_OPT(labels2) STR_OPT(savedir) STR_OPT(db) STR_OPT(label1) STR_OPT(label2) STR_OPT(subtreeout) STR_OPT(supertreeout) STR_OPT(refmsa) STR_OPT(ref) STR_OPT(refdir) STR_OPT(testdir) STR_OPT(testdir1); STR_OPT(testdir2); STR_OPT(outdir) STR_OPT(hmmin) STR_OPT(hmmout) STR_OPT(report) STR_OPT(accalnout) 
STR_OPT(perm) STR_OPT(calnout) UNS_OPT(threads) UNS_OPT(consiters) UNS_OPT(refineiters) UNS_OPT(randseed) UNS_OPT(paircount) UNS_OPT(n) UNS_OPT(splitcount) UNS_OPT(maxcoarse) UNS_OPT(perturb) UNS_OPT(replicates) UNS_OPT(maxcols) FLT_OPT(min_cons_pct) FLT_OPT(max_gap_fract) FLT_OPT(minea) FLT_OPT(super5_minea1) FLT_OPT(super4_minea1) FLT_OPT(super4_minea2) FLT_OPT(pctid) FLT_OPT(perturb_var) FLT_OPT(minconf) FLAG_OPT(quiet) FLAG_OPT(compilerinfo) FLAG_OPT(right) FLAG_OPT(scaledist) FLAG_OPT(eadist) FLAG_OPT(force_super4) FLAG_OPT(force_probcons) FLAG_OPT(allpairs) FLAG_OPT(nt) FLAG_OPT(amino) FLAG_OPT(accs) FLAG_OPT(verbose) FLAG_OPT(basename) FLAG_OPT(intsuffix) FLAG_OPT(stratified) FLAG_OPT(diversified) FLAG_OPT(randomchaintree) #undef FLAG_OPT #undef UNS_OPT #undef FLT_OPT #undef STR_OPT muscle-5.1.0/src/mysparsemx.cpp000066400000000000000000000122361424453062600165220ustar00rootroot00000000000000#include "myutils.h" #include "mysparsemx.h" void MySparseMx::AllocVec(uint Size) { if (Size <= m_MaxVecSize) return; if (m_MaxVecSize > 0) myfree(m_ValueVec); m_MaxVecSize = Size + 256; asserta(sizeof(float) == 4 && sizeof(uint) == 4); m_ValueVec = myalloc(byte, m_MaxVecSize*8); } float MySparseMx::GetMaxProbRow(uint i) const { const uint Offset = GetOffset(i); const uint Size = GetSize(i); float Max = 0; for (uint k = 0; k < Size; ++k) { float P = GetProb_Offset(Offset + k); Max = max(Max, P); } return Max; } uint MySparseMx::GetOffset(uint i) const { assert(i < m_LX); uint Offset = m_Offsets[i]; return Offset; } uint MySparseMx::GetSize(uint i) const { assert(i < m_LX); uint Offset = GetOffset(i); uint Size = m_Offsets[i+1] - Offset; return Size; } float MySparseMx::GetProb(uint i, uint j) const { uint Offset = GetOffset(i); uint Size = GetSize(i); for (uint k = 0; k < Size; ++k) { const uint *ptr_j = (uint *) (m_ValueVec + 8*Offset + 4); uint j2 = *ptr_j; if (j2 == j) { const float *ptr_P = (float *) (m_ValueVec + 8*Offset); return *ptr_P; } else if (j2 > j) return 
0; ++Offset; } return 0; } void MySparseMx::AllocLX(uint LX) { #if 0//TRACE Log("%p->AllocLX(%u) max %u\n", this, LX, m_MaxLX); #endif if (LX <= m_MaxLX) return; if (m_MaxLX > 0) myfree(m_Offsets); m_MaxLX = LX + 128; m_Offsets = myalloc(uint, m_MaxLX+1); #if 0//TRACE Log("%p->AllocLX(%u) newmax %u m_Offsets=%p\n", this, LX, m_MaxLX, m_Offsets); #endif #if DEBUG for (uint i = 0; i <= m_MaxLX; ++i) m_Offsets[i] = UINT_MAX; #endif } void MySparseMx::UpdateFromPost(const MySparseMx &OldMx, const float *Post, uint SeqCount) { uint VecSize = OldMx.m_VecSize; uint LX = OldMx.GetLX(); uint LY = OldMx.GetLY(); AllocLX(LX); AllocVec(VecSize); m_LX = LX; m_LY = LY; for (uint i = 0; i < LX; ++i) m_Offsets[i] = OldMx.m_Offsets[i]; m_Offsets[LX] = OldMx.m_Offsets[LX]; for (uint i = 0; i < m_LX; ++i) { uint Offset = GetOffset(i); uint Size = GetSize(i); for (uint k = 0; k < Size; ++k) { uint Col = OldMx.GetCol_Offset(Offset + k); float P = Post[i*LY + Col]/SeqCount; SetProb_Offset(Offset + k, P); SetCol_Offset(Offset + k, Col); } } } void MySparseMx::FromPost(const float *Post, uint LX, uint LY) { asserta(sizeof(float) == sizeof(uint)); m_LX = LX; m_LY = LY; AllocLX(LX); uint Offset = 0; for (uint i = 0; i < LX; ++i) { m_Offsets[i] = Offset; for (uint j = 0; j < LY; ++j) if (Post[i*LY + j] >= MIN_SPARSE_PROB) ++Offset; } m_Offsets[LX] = Offset; m_VecSize = Offset; AllocVec(m_VecSize); Offset = 0; for (uint i = 0; i < LX; ++i) { for (uint j = 0; j < LY; ++j) { float P = Post[i*LY + j]; if (P >= MIN_SPARSE_PROB) { float *ptr_P = (float *) (m_ValueVec + 8*Offset); uint *ptr_j = (uint *) (m_ValueVec + 8*Offset + 4); *ptr_P = P; *ptr_j = j; ++Offset; } } } asserta(Offset == m_VecSize); } void MySparseMx::LogStats(const char *Msg) const { Log("MySparseMx(%s) LX=%u, LY=%u VecSize=%u\n", Msg, m_LX, m_LY, m_VecSize); } void MySparseMx::LogMe() const { Log("\n"); Log("MySparseMx(%p) LX=%u, LY=%u\n", this, m_LX, m_LY); #if 0 for (uint i = 0; i < m_LX; ++i) { uint Offset = GetOffset(i); 
uint Size = GetSize(i); Log("[Row %5u]", i); Log(" [off %5u]", Offset); Log(" [size %5u]", Size); for (uint k = 0; k < Size; ++k) { const float *ptr_P = (float *) (m_ValueVec + 8*Offset); const uint *ptr_j = (uint *) (m_ValueVec + 8*Offset + 4); Log(" %u=%.3g", *ptr_j, *ptr_P); ++Offset; } Log("\n"); } #endif Log("\n"); Log(" Row Size"); if (m_X != 0) Log(" x "); for (uint j = 0; j < m_LY; ++j) Log(" %8u", j); Log("\n"); if (m_Y != 0) { Log("\n"); Log(" "); for (uint j = 0; j < m_LY; ++j) Log(" %8c", m_Y[j]); Log("\n"); } for (uint i = 0; i < m_LX; ++i) { uint Size = GetSize(i); Log("%5u", i); Log(" %5u", Size); if (m_X != 0) Log(" %c ", m_X[i]); for (uint j = 0; j < m_LY; ++j) { float P = GetProb(i, j); if (P == 0) Log(" %8.8s", "."); else Log(" %8.3g", P); } Log("\n"); } } void MySparseMx::ToPost(float *Post) const { for (uint i = 0; i < m_LX*m_LY; ++i) Post[i] = 0; for (uint Row = 0; Row < m_LX; ++Row) { uint Offset = GetOffset(Row); uint Size = GetSize(Row); for (uint k = 0; k < Size; ++k) { float P = GetProb_Offset(Offset + k); uint Col = GetCol_Offset(Offset + k); Post[Row*m_LY + Col] = P; } } } void MySparseMx::GetColToRowLoHi(vector &ColToRowLo, vector &ColToRowHi) const { ColToRowLo.clear(); ColToRowHi.clear(); ColToRowLo.resize(m_LY, UINT_MAX); ColToRowHi.resize(m_LY, UINT_MAX); for (uint Row = 0; Row < m_LX; ++Row) { uint Offset = GetOffset(Row); uint Size = GetSize(Row); for (uint k = 0; k < Size; ++k) { uint Col = GetCol_Offset(Offset + k); assert(Col < m_LY); if (ColToRowLo[Col] == UINT_MAX) { ColToRowLo[Col] = Row; ColToRowHi[Col] = Row; } else { if (Row < ColToRowLo[Col]) ColToRowLo[Col] = Row; if (Row > ColToRowHi[Col]) ColToRowHi[Col] = Row; } } } } muscle-5.1.0/src/mysparsemx.h000066400000000000000000000034511424453062600161660ustar00rootroot00000000000000#pragma once const float MIN_SPARSE_PROB = 0.01f; const float MIN_SPARSE_SCORE = logf(MIN_SPARSE_PROB); // -4.6 class MySparseMx { public: uint m_LX = 0; uint m_LY = 0; uint m_VecSize = 0; uint 
m_MaxVecSize = 0; byte *m_ValueVec = 0; uint m_MaxLX = 0; uint *m_Offsets = 0; const byte *m_X = 0; const byte *m_Y = 0; public: MySparseMx() { m_LX = 0; m_LY = 0; m_VecSize = 0; m_MaxVecSize = 0; m_ValueVec = 0; m_MaxLX = 0; m_Offsets = 0; m_X = 0; m_Y = 0; } ~MySparseMx() { Clear(); } void Clear() { myfree(m_ValueVec); myfree(m_Offsets); m_ValueVec = 0; m_Offsets = 0; m_MaxVecSize = 0; m_MaxLX = 0; m_VecSize = 0; m_LX = 0; m_LY = 0; } float GetProb_Offset(uint Offset) const { const float *ptr_P = (float *) (m_ValueVec + 8*Offset); float P = *ptr_P; return P; } uint GetCol_Offset(uint Offset) const { const uint *ptr_j = (uint *) (m_ValueVec + 8*Offset + 4); uint Col = *ptr_j; return Col; } void SetProb_Offset(uint Offset, float P) { float *ptr_P = (float *) (m_ValueVec + 8*Offset); *ptr_P = P; } void SetCol_Offset(uint Offset, uint Col) const { uint *ptr_j = (uint *) (m_ValueVec + 8*Offset + 4); *ptr_j = Col; } void AllocLX(uint LX); void AllocVec(uint Size); void FromPost(const float *Post, uint LX, uint LY); void UpdateFromPost(const MySparseMx &OldMx, const float *Post, uint SeqCount); void GetColToRowLoHi(vector &ColToRowLo, vector &ColToRowHi) const; void ToPost(float *Post) const; float GetProb(uint i, uint j) const; uint GetOffset(uint i) const; uint GetSize(uint i) const; float GetMaxProbRow(uint i) const; void LogMe() const; void LogStats(const char *Msg = "") const; const uint GetLX() const { return m_LX; } const uint GetLY() const { return m_LY; } }; muscle-5.1.0/src/myutils.cpp000066400000000000000000001610271424453062600160230ustar00rootroot00000000000000#ifdef _MSC_VER #define _SCL_SECURE_NO_WARNINGS //#define _HAS_ITERATOR_DEBUGGING 0 //#define _ITERATOR_DEBUG_LEVEL 0 #define MYUTILS_CPP #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _MSC_VER #define WIN32_LEAN_AND_MEAN #include #include #include #include #include #else #include #include #include #include 
#include #include #include #endif #ifndef _MSC_VER #include #include #endif #include "myutils.h" unsigned g_AllocLine; const char *g_AllocFile; static map FileToFileName; const unsigned MY_IO_BUFSIZ = 32000; const unsigned MAX_FORMATTED_STRING_LENGTH = 64000; static char *g_IOBuffers[256]; static time_t g_StartTime = time(0); extern vector g_Argv; static double g_PeakMemUseBytes; static char g_TmpStr[64]; unsigned g_AllocCount; unsigned g_FreeCount; #if ALLOC_TOTALS uint64 g_AllocTotal; uint64 g_FreeTotal; #endif static const double LOG2 = log(2.0); static const double LOG10 = log(10.0); double mylog2(double x) { return log(x)/LOG2; } double mylog10(double x) { return log(x)/LOG10; } unsigned GetRequestedThreadCount() { static unsigned N = 1; static bool Done = false; if (Done) return N; unsigned MaxN = omp_get_max_threads(); unsigned CoreCount = GetCPUCoreCount(); static bool MessageShown = false; if (optset_threads) N = opt(threads); else { if (CoreCount > 20) { Progress("CPU has %u cores, defaulting to 20 threads\n", CoreCount); MessageShown = true; N = 20; } else N = CoreCount; } if (N > MaxN) { Warning("Max OMP threads %u", MaxN); N = MaxN; } if (N == 0) N = 1; Done = true; if (!MessageShown) { Progress("CPU has %u cores, running %u threads\n", CoreCount, N); MessageShown = true; } return N; } const char *GetPlatform() { #if BITS==32 asserta(sizeof(void *) == 4); #ifdef _MSC_VER return "win32"; #elif defined(__APPLE__) return "osx32"; #elif defined(__GNUC__) return "linux32"; #else #error "Unknown compiler" #endif #elif BITS==64 asserta(sizeof(void *) == 8); #ifdef _MSC_VER return "win64"; #elif defined(__arm64__) return "osxarm64"; #elif defined(__APPLE__) return "osx64"; #elif defined(__GNUC__) return "linux64"; #else #error "Unknown compiler" #endif #else #error "Bad BITS" #endif } const char *BaseName(const char *PathName) { const char *q = 0; for (const char *p = PathName; *p; ++p) { if (*p == '/' || *p == '\\') q = p + 1; } if (q != 0) return q; return 
PathName; } static void AllocBuffer(FILE *f) { #if DEBUG setbuf(f, 0); #else int fd = fileno(f); if (fd < 0 || fd >= 256) return; if (g_IOBuffers[fd] == 0) g_IOBuffers[fd] = myalloc(char, MY_IO_BUFSIZ); setvbuf(f, g_IOBuffers[fd], _IOFBF, MY_IO_BUFSIZ); #endif } static void FreeBuffer(FILE *f) { #if 0 int fd = fileno(f); if (fd < 0 || fd >= 256) return; if (g_IOBuffers[fd] == 0) return; myfree(g_IOBuffers[fd]); g_IOBuffers[fd] = 0; #endif } unsigned GetElapsedSecs() { return (unsigned) (time(0) - g_StartTime); } bool StdioFileExists(const string &FileName) { struct stat SD; int i = stat(FileName.c_str(), &SD); return i == 0; } void myassertfail(const char *Exp, const char *File, unsigned Line) { Die("%s(%u) assert failed: %s", File, Line, Exp); } bool myisatty(int fd) { return isatty(fd) != 0; } #ifdef _MSC_VER #include int fseeko(FILE *stream, off_t offset, int whence) { off_t FilePos = _fseeki64(stream, offset, whence); return (FilePos == -1L) ? -1 : 0; } #define ftello(fm) (off_t) _ftelli64(fm) #endif void LogStdioFileState(FILE *f) { unsigned long tellpos = (unsigned long) ftello(f); long fseek_pos = fseek(f, 0, SEEK_CUR); int fd = fileno(f); Log("FILE * %p\n", f); Log("fileno %d\n", fd); Log("feof %d\n", feof(f)); Log("ferror %d\n", ferror(f)); Log("ftell %ld\n", tellpos); Log("fseek %ld\n", fseek_pos); #if !defined(_GNU_SOURCE) && !defined(__APPLE_CC__) fpos_t fpos; int fgetpos_retval = fgetpos(f, &fpos); Log("fpos %ld (retval %d)\n", (long) fpos, fgetpos_retval); // Log("eof %d\n", _eof(fd)); #endif #ifdef _MSC_VER __int64 pos64 = _ftelli64(f); Log("_ftelli64 %lld\n", pos64); #endif if (FileToFileName.find(f) == FileToFileName.end()) Log("Not found in FileToFileName\n"); else Log("Name %s\n", FileToFileName[f].c_str()); } void ParseFileName(const string &FileName, string &Path, string &Name) { size_t n = string::npos; size_t n1 = FileName.rfind('/'); if (n1 != string::npos) n = n1; #if _MSC_VER size_t n2 = FileName.rfind('\\'); size_t n3 = 
FileName.rfind(':'); if (n2 != string::npos && n2 > n) n = n2; if (n3 != string::npos && n3 > n) n = n3; #endif if (n == string::npos) { Path = "."; Name = FileName; return; } Path = FileName.substr(0, n); Name = FileName.substr(n+1, string::npos); } #ifdef _MSC_VER void ReadDir(const string &DirName, vector &FileNames) { FileNames.clear(); if (DirName.find('?') != string::npos || DirName.find('*') != string::npos) Die("Invalid directory name '%s'", DirName.c_str()); string DirNameSlashStar = DirName; if (!EndsWith(DirName, "/") && !EndsWith(DirName, "\\")) DirNameSlashStar += "/"; DirNameSlashStar += "*"; struct _finddata_t FileInfo; intptr_t h = _findfirst(DirNameSlashStar.c_str(), &FileInfo); if (h == -1) Die("Directory not found '%s'", DirName.c_str()); for (;;) { string FileName = string(FileInfo.name); FileNames.push_back(FileName); int rc = _findnext(h, &FileInfo); if (rc != 0) break; FileNames.push_back(FileInfo.name); } _findclose(h); sort(FileNames.begin(), FileNames.end()); } #else void ReadDir(const string &DirName, vector &FileNames) { FileNames.clear(); DIR *h = opendir(DirName.c_str()); if (h == 0) Die("Directory not found '%s'", DirName.c_str()); for (;;) { struct dirent *d = readdir(h); if (d == 0) break; string FileName = string(d->d_name); FileNames.push_back(FileName); } closedir(h); sort(FileNames.begin(), FileNames.end()); } #endif FILE *OpenStdioFile(const string &FileName) { if (FileName == "") Die("Missing input file name"); const char *Mode = "rb"; FILE *f = fopen(FileName.c_str(), Mode); if (f == 0) { if (errno == EFBIG) { if (sizeof(off_t) == 4) Die("File too big for 32-bit version (sizeof(off_t)=%d): %s", sizeof(off_t), FileName.c_str()); else Die("Cannot open '%s', file too big (off_t=%u bits)", FileName.c_str(), sizeof(off_t)*8); } Die("Cannot open %s, errno=%d %s", FileName.c_str(), errno, strerror(errno)); } AllocBuffer(f); FileToFileName[f] = FileName; return f; } FILE *CreateStdioFile(const string &FileName) { if (FileName == "") 
// Die("Missing output file name"); return 0; FILE *f = fopen(FileName.c_str(), "wb+"); if (0 == f) Die("Cannot create %s, errno=%d %s", FileName.c_str(), errno, strerror(errno)); const unsigned MYBUFFSZ = 262144+8; char *buf = (char *) malloc(MYBUFFSZ); setvbuf(f, buf, _IOFBF, MYBUFFSZ); AllocBuffer(f); FileToFileName[f] = FileName; return f; } void SetStdioFilePos(FILE *f, uint32 Pos) { if (0 == f) Die("SetStdioFilePos failed, f=NULL"); int Ok = fseeko(f, Pos, SEEK_SET); off_t NewPos = ftello(f); if (Ok != 0 || Pos != NewPos) { LogStdioFileState(f); Die("SetStdioFilePos(%d) failed, Ok=%d NewPos=%d", (int) Pos, Ok, (int) NewPos); } } void SetStdioFilePos64(FILE *f, uint64 Pos) { if (0 == f) Die("SetStdioFilePos failed, f=NULL"); int Ok = fseeko(f, Pos, SEEK_SET); off_t NewPos = ftello(f); if (Ok != 0 || Pos != NewPos) { LogStdioFileState(f); Die("SetStdioFilePos64(%ul) failed, Ok=%d NewPos=%ul", (unsigned long) Pos, Ok, (unsigned long) NewPos); } } uint32 ReadStdioFile_NoFail(FILE *f, void *Buffer, uint32 Bytes) { asserta(f != 0); off_t PosBefore = ftello(f); size_t ElementsRead = fread(Buffer, Bytes, 1, f); off_t PosAfter = ftello(f); if (ElementsRead == 1) return Bytes; uint32 BytesRead = uint32(PosAfter - PosBefore); return BytesRead; } void ReadStdioFile(FILE *f, uint32 Pos, void *Buffer, uint32 Bytes) { asserta(f != 0); SetStdioFilePos(f, Pos); uint32 ElementsRead = (uint32) fread(Buffer, Bytes, 1, f); if (ElementsRead != 1) { LogStdioFileState(f); Die("ReadStdioFile failed, attempted %lu bytes, errno=%d", (unsigned long) Bytes, errno); } } void ReadStdioFile64(FILE *f, uint64 Pos, void *Buffer, uint64 Bytes) { asserta(f != 0); uint32 Bytes32 = (uint32) Bytes; asserta(Bytes32 == Bytes); SetStdioFilePos64(f, Pos); uint64 ElementsRead = (uint64) fread(Buffer, Bytes32, 1, f); if (ElementsRead != Bytes) { LogStdioFileState(f); Die("ReadStdioFile64 failed, attempted %lu bytes, errno=%d", (unsigned long) Bytes, errno); } } void ReadStdioFile(FILE *f, void *Buffer, 
uint32 Bytes) { asserta(f != 0); size_t ElementsRead = fread(Buffer, Bytes, 1, f); if (ElementsRead != 1) { LogStdioFileState(f); Die("ReadStdioFile64 failed, attempted %u bytes, errno=%d", Bytes, errno); } } void ReadStdioFile64(FILE *f, void *Buffer, uint64 Bytes) { asserta(f != 0); size_t ElementsRead = fread(Buffer, Bytes, 1, f); if (ElementsRead != 1) { LogStdioFileState(f); Die("ReadStdioFile64 failed, attempted %u bytes, errno=%d", Bytes, errno); } } byte *ReadAllStdioFile(FILE *f, uint32 &FileSize) { uint64 Pos = GetStdioFilePos64(f); uint64 FileSize64 = GetStdioFileSize64(f); #if BITS == 32 if (FileSize > UINT_MAX) Die("ReadAllStdioFile (32-bit): file too big"); #endif FileSize = uint32(FileSize64); SetStdioFilePos(f, 0); byte *Buffer = myalloc(byte, FileSize); ReadStdioFile(f, Buffer, FileSize); SetStdioFilePos64(f, Pos); return Buffer; } byte *ReadAllStdioFile64(const string &FileName, uint64 &FileSize) { FILE *f = OpenStdioFile(FileName); FileSize = GetStdioFileSize64(f); #if BITS==32 if (FileSize > UINT32_MAX) Die("File too big, requires 64-bit version: %s", FileName.c_str()); #endif byte *Buffer = ReadAllStdioFile64(f, FileSize); CloseStdioFile(f); return Buffer; } byte *ReadAllStdioFile64(FILE *f, uint64 &FileSize) { uint64 SavedPos = GetStdioFilePos64(f); FileSize = GetStdioFileSize64(f); #if BITS==32 if (FileSize > UINT32_MAX) Die("File too big, requires 64-bit version"); byte *Buffer = myalloc(byte, (uint32) FileSize); #else if (FileSize > UINT_MAX) Die("ReadAllStdioFile64, file too big %s", MemBytesToStr((double) FileSize)); unsigned uFileSize = (unsigned) FileSize; byte *Buffer = myalloc(byte, uFileSize); #endif uint64 Pos = 0; uint64 BytesLeft = FileSize; const uint64 ChunkSize = 0x40000000; // 1Gb for (;;) { if (BytesLeft == 0) break; uint64 BytesToRead = BytesLeft; if (BytesToRead > ChunkSize) BytesToRead = ChunkSize; ReadStdioFile64(f, Pos, Buffer + Pos, BytesToRead); BytesLeft -= BytesToRead; } SetStdioFilePos64(f, SavedPos); return Buffer; 
} byte *ReadAllStdioFile32(const std::string &FileName, uint32 &FileSize) { #if WIN32 FILE *f = OpenStdioFile(FileName); FileSize = GetStdioFileSize32(f); CloseStdioFile(f); HANDLE h = CreateFile(FileName.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (h == INVALID_HANDLE_VALUE) Die("ReadAllStdioFile:Open(%s) failed", FileName.c_str()); byte *Buffer = myalloc(byte, FileSize); DWORD BytesRead; ReadFile(h, Buffer, FileSize, &BytesRead, NULL); if (FileSize != BytesRead) Die("ReadAllStdioFile:Error reading %s, attempted %u got %u", FileName.c_str(), FileSize, (unsigned) BytesRead); CloseHandle(h); return Buffer; #else int h = open(FileName.c_str(), O_RDONLY); if (h < 0) Die("ReadAllStdioFile:Cannot open %s", FileName.c_str()); FileSize = lseek(h, 0, SEEK_END); #ifndef __APPLE__ if (FileSize == (off_t) (-1)) Die("ReadAllStdioFile:Error seeking %s", FileName.c_str()); #endif // byte *Buffer = myalloc(FileSize); size_t stBytes = (size_t) FileSize; if ((off_t) stBytes != FileSize) Die("ReadAllStdioFile: off_t overflow"); byte *Buffer = (byte *) myalloc(byte, stBytes); if (Buffer == 0) Die("ReadAllStdioFile: failed to allocate %s", MemBytesToStr((double) stBytes)); lseek(h, 0, SEEK_SET); size_t n = read(h, Buffer, stBytes); if (n != FileSize) Die("ReadAllStdioFile, Error reading %s, attempted %g got %g", FileName.c_str(), (double) FileSize, (double) n); close(h); return Buffer; #endif } void WriteStdioFile(FILE *f, uint32 Pos, const void *Buffer, uint32 Bytes) { if (0 == f) Die("WriteStdioFile failed, f=NULL"); SetStdioFilePos(f, Pos); size_t BytesWritten = fwrite(Buffer, 1, Bytes, f); if (BytesWritten != Bytes) { LogStdioFileState(f); Die("WriteStdioFile failed, attempted %ul bytes, wrote %ul bytes, errno=%d", (unsigned long) Bytes, (unsigned long) BytesWritten, errno); } } void WriteStdioFileStr(FILE *f, const char *s) { uint32 Bytes = ustrlen(s); WriteStdioFile(f, s, Bytes); } void WriteStdioFile(FILE *f, const void *Buffer, 
uint32 Bytes) { if (0 == f) Die("WriteStdioFile failed, f=NULL"); size_t BytesWritten = fwrite(Buffer, 1, Bytes, f); if (BytesWritten != Bytes) { LogStdioFileState(f); Die("WriteStdioFile failed, attempted %ul bytes, wrote %ul bytes, errno=%d", (unsigned long) Bytes, (unsigned long) BytesWritten, errno); } } void WriteStdioFile64(FILE *f, const void *Buffer, uint64 Bytes) { if (0 == f) Die("WriteStdioFile failed, f=NULL"); size_t BytesWritten = fwrite(Buffer, 1, Bytes, f); if (BytesWritten != Bytes) { LogStdioFileState(f); Die("WriteStdioFile failed, attempted %ul bytes, wrote %ul bytes, errno=%d", (unsigned long) Bytes, (unsigned long) BytesWritten, errno); } } // Return false on EOF, true if line successfully read. bool ReadLineStdioFile(FILE *f, char *Line, uint32 Bytes) { if (feof(f)) return false; if ((int) Bytes < 0) Die("ReadLineStdioFile: Bytes < 0"); char *RetVal = fgets(Line, (int) Bytes, f); if (NULL == RetVal) { if (feof(f)) return false; if (ferror(f)) Die("ReadLineStdioFile: errno=%d", errno); Die("ReadLineStdioFile: fgets=0, feof=0, ferror=0"); } if (RetVal != Line) Die("ReadLineStdioFile: fgets != Buffer"); size_t n = strlen(Line); if (n < 1 || Line[n-1] != '\n') Die("ReadLineStdioFile: line too long or missing end-of-line"); if (n > 0 && (Line[n-1] == '\r' || Line[n-1] == '\n')) Line[n-1] = 0; if (n > 1 && (Line[n-2] == '\r' || Line[n-2] == '\n')) Line[n-2] = 0; return true; } void ReadTabbedLineStdioFile(FILE *f, vector &Fields, unsigned FieldCount) { string Line; bool Ok = ReadLineStdioFile(f, Line); if (!Ok) Die("Unxpected end-of-file in tabbed text"); Split(Line, Fields, '\t'); unsigned n = SIZE(Fields); if (FieldCount != UINT_MAX && n != FieldCount) { Log("\n"); Log("Line='%s'\n", Line.c_str()); Die("Expected %u tabbed fields, got %u", FieldCount, n); } } // Return false on EOF, true if line successfully read. 
bool ReadLineStdioFile(FILE *f, string &Line) { Line.clear(); for (;;) { int c = fgetc(f); if (c == -1) { if (feof(f)) { if (!Line.empty()) return true; return false; } Die("ReadLineStdioFile, errno=%d", errno); } if (c == '\r') continue; if (c == '\n') return true; Line.push_back((char) c); } } void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo) { int Ok = rename(FileNameFrom.c_str(), FileNameTo.c_str()); if (Ok != 0) Die("RenameStdioFile(%s,%s) failed, errno=%d %s", FileNameFrom.c_str(), FileNameTo.c_str(), errno, strerror(errno)); } void FlushStdioFile(FILE *f) { int Ok = fflush(f); if (Ok != 0) Die("fflush(%p)=%d,", f, Ok); } void CloseStdioFile(FILE *f) { if (f == 0) return; int Ok = fclose(f); if (Ok != 0) Die("fclose(%p)=%d", f, Ok); FreeBuffer(f); } uint32 GetStdioFilePos32(FILE *f) { off_t FilePos = ftello(f); if (FilePos < 0) Die("ftello=%d", (int) FilePos); if (FilePos > UINT32_MAX) Die("File offset too big for 32-bit version (%s)", MemBytesToStr((double) FilePos)); return (uint32) FilePos; } uint64 GetStdioFilePos64(FILE *f) { off_t FilePos = ftello(f); if (FilePos < 0) Die("ftello=%d", (int) FilePos); return (uint64) FilePos; } uint32 GetStdioFileSize32(FILE *f) { uint32 CurrentPos = GetStdioFilePos32(f); int Ok = fseeko(f, 0, SEEK_END); if (Ok < 0) Die("fseek in GetFileSize"); off_t Length = ftello(f); SetStdioFilePos(f, CurrentPos); if (Length < 0) Die("ftello in GetFileSize"); #if BITS == 32 if (Length > UINT32_MAX) Die("File size too big for 32-bit version (%s)", MemBytesToStr((double) Length)); #endif return (uint32) Length; } uint64 GetStdioFileSize64(FILE *f) { uint64 CurrentPos = GetStdioFilePos64(f); int Ok = fseeko(f, 0, SEEK_END); if (Ok < 0) Die("fseek in GetFileSize64"); off_t Length = ftello(f); SetStdioFilePos64(f, CurrentPos); if (Length < 0) Die("ftello in GetFileSize"); #if BITS == 32 if (Length > UINT32_MAX) Die("File size too big for 32-bit version (%s)", MemBytesToStr((double) Length)); #endif return (uint64) 
Length; } void MoveStdioFile(const string &FileName1, const string &FileName2) { if (StdioFileExists(FileName2)) DeleteStdioFile(FileName2); RenameStdioFile(FileName1, FileName2); } void DeleteStdioFile(const string &FileName) { int Ok = remove(FileName.c_str()); if (Ok != 0) Die("remove(%s) failed, errno=%d %s", FileName.c_str(), errno, strerror(errno)); } double GetUsableMemBytes() { double RAM = GetPhysMemBytes(); #if BITS==32 #ifdef _MSC_VER if (RAM > 2e9) return 2e9; #else if (RAM > 4e9) return 4e9; #endif #endif return RAM; } static char **g_ThreadStrs; static unsigned g_ThreadStrCount; static char *GetThreadStr() { unsigned ThreadIndex = GetThreadIndex(); if (ThreadIndex >= g_ThreadStrCount) { unsigned NewThreadStrCount = ThreadIndex + 4; char **NewThreadStrs = myalloc(char *, NewThreadStrCount); memset_zero(NewThreadStrs, NewThreadStrCount); if (g_ThreadStrCount > 0) memcpy(NewThreadStrs, g_ThreadStrs, g_ThreadStrCount*sizeof(char *)); g_ThreadStrs = NewThreadStrs; g_ThreadStrCount = NewThreadStrCount; } if (g_ThreadStrs[ThreadIndex] == 0) g_ThreadStrs[ThreadIndex] = myalloc(char, MAX_FORMATTED_STRING_LENGTH+1); char *Str = g_ThreadStrs[ThreadIndex]; return Str; } void myvstrprintf(string &Str, const char *Format, va_list ArgList) { char *szStr = GetThreadStr(); vsnprintf(szStr, MAX_FORMATTED_STRING_LENGTH-1, Format, ArgList); szStr[MAX_FORMATTED_STRING_LENGTH - 1] = '\0'; Str.assign(szStr); } void Pf(FILE *f, const char *Format, ...) { if (f == 0) return; va_list ArgList; va_start(ArgList, Format); vfprintf(f, Format, ArgList); va_end(ArgList); } void Ps(string &Str, const char *Format, ...) { va_list ArgList; va_start(ArgList, Format); myvstrprintf(Str, Format, ArgList); va_end(ArgList); } void Psa(string &Str, const char *Format, ...) { va_list ArgList; va_start(ArgList, Format); string Tmp; myvstrprintf(Tmp, Format, ArgList); va_end(ArgList); Str += Tmp; } void Psasc(string &Str, const char *Format, ...) 
{ unsigned n = SIZE(Str); if (n > 0 && Str[n-1] != ';') Str += ";"; va_list ArgList; va_start(ArgList, Format); string Tmp; myvstrprintf(Tmp, Format, ArgList); va_end(ArgList); Str += Tmp; n = SIZE(Str); if (n > 0 && Str[n-1] != ';') Str += ";"; } FILE *g_fLog = 0; void SetLogFileName(const string &FileName) { if (g_fLog != 0) CloseStdioFile(g_fLog); g_fLog = 0; if (FileName.empty()) return; g_fLog = CreateStdioFile(FileName); setbuf(g_fLog, 0); } void Log(const char *Format, ...) { if (g_fLog == 0) return; va_list ArgList; va_start(ArgList, Format); vfprintf(g_fLog, Format, ArgList); va_end(ArgList); fflush(g_fLog); } void Die_(const char *Format, ...) { va_list ArgList; va_start(ArgList, Format); #pragma omp critical { static bool InDie = false; if (InDie) exit(1); InDie = true; string Msg; if (g_fLog != 0) setbuf(g_fLog, 0); myvstrprintf(Msg, Format, ArgList); fprintf(stderr, "\n\n"); Log("\n"); time_t t = time(0); Log("%s", asctime(localtime(&t))); for (unsigned i = 0; i < g_Argv.size(); i++) { fprintf(stderr, (i == 0) ? "%s" : " %s", g_Argv[i].c_str()); Log((i == 0) ? "%s" : " %s", g_Argv[i].c_str()); } fprintf(stderr, "\n"); Log("\n"); time_t CurrentTime = time(0); unsigned ElapsedSeconds = unsigned(CurrentTime - g_StartTime); const char *sstr = SecsToStr(ElapsedSeconds); Log("Elapsed time: %s\n", sstr); const char *szStr = Msg.c_str(); fprintf(stderr, "Elapsed time %s\n", SecsToHHMMSS(ElapsedSeconds)); fprintf(stderr, "Max memory %s\n", MemBytesToStr(g_PeakMemUseBytes)); fprintf(stderr, "\n---Fatal error---\n%s\n", szStr); Log("\n---Fatal error---\n%s\n", szStr); #ifdef _MSC_VER if (IsDebuggerPresent()) __debugbreak(); _CrtSetDbgFlag(0); #endif exit(1); } va_end(ArgList); } void Warning_(const char *Format, ...) 
{ string Msg; va_list ArgList; va_start(ArgList, Format); myvstrprintf(Msg, Format, ArgList); va_end(ArgList); const char *szStr = Msg.c_str(); fprintf(stderr, "\nWARNING: %s\n\n", szStr); if (g_fLog != stdout) { Log("\nWARNING: %s\n", szStr); fflush(g_fLog); } } #ifdef _MSC_VER void mysleep(unsigned ms) { Sleep(ms); } #else void mysleep(unsigned ms) { usleep(ms); } #endif #ifdef _MSC_VER double GetMemUseBytes() { HANDLE hProc = GetCurrentProcess(); PROCESS_MEMORY_COUNTERS PMC; BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC)); if (!bOk) return 1000000; double Bytes = (double) PMC.WorkingSetSize; if (Bytes > g_PeakMemUseBytes) g_PeakMemUseBytes = Bytes; return Bytes; } double GetPhysMemBytes() { MEMORYSTATUSEX MS; MS.dwLength = sizeof(MS); BOOL Ok = GlobalMemoryStatusEx(&MS); if (!Ok) return 0.0; return double(MS.ullTotalPhys); } #elif linux || __linux__ || __CYGWIN__ double GetPhysMemBytes() { int fd = open("/proc/meminfo", O_RDONLY); if (fd < 0) return 0.0; // MemTotal: 255908 kB char Line[128]; int n = read(fd, Line, sizeof(Line)); if (n < 0) return 0.0; Line[127] = 0; unsigned kb; n = sscanf(Line, "MemTotal: %u", &kb); if (n != 1) return 0.0; return double(kb)*1000.0; } double GetMemUseBytes() { static char statm[64]; static int PageSize = 1; if (0 == statm[0]) { PageSize = sysconf(_SC_PAGESIZE); pid_t pid = getpid(); sprintf(statm, "/proc/%d/statm", (int) pid); } int fd = open(statm, O_RDONLY); if (fd < 0) return 0.0; char Buffer[64]; int n = read(fd, Buffer, sizeof(Buffer) - 1); close(fd); fd = -1; if (n <= 0) return 0.0; Buffer[n] = 0; double Pages = atof(Buffer); double Bytes = Pages*PageSize; if (Bytes > g_PeakMemUseBytes) g_PeakMemUseBytes = Bytes; return Bytes; } #elif defined(__MACH__) #include #include #include #include #include #include #include #include #include #include #include #include #include // #include #include #include #include #define DEFAULT_MEM_USE 0.0 double GetMemUseBytes() { task_t mytask = mach_task_self(); struct 
task_basic_info ti; memset((void *) &ti, 0, sizeof(ti)); mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT; kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count); if (ok == KERN_INVALID_ARGUMENT) return DEFAULT_MEM_USE; if (ok != KERN_SUCCESS) return DEFAULT_MEM_USE; double Bytes = (double ) ti.resident_size; if (Bytes > g_PeakMemUseBytes) g_PeakMemUseBytes = Bytes; return Bytes; } double GetPhysMemBytes() { uint64_t mempages = 0; size_t len = sizeof(mempages); int rc = sysctlbyname("hw.memsize", &mempages, &len, NULL, 0); if (rc < 0) return 0.0; return double(mempages); } #else double GetMemUseBytes() { return 0.0; } #endif #ifdef _MSC_VER void mylistdir(const string &DirName, vector &FileNames) { FileNames.clear(); bool First = true; HANDLE h = INVALID_HANDLE_VALUE; WIN32_FIND_DATA FFD; for (;;) { if (First) { string s = DirName + string("/*"); h = FindFirstFile(s.c_str(), &FFD); if (h == INVALID_HANDLE_VALUE) return; First = false; } else { BOOL Ok = FindNextFile(h, &FFD); if (!Ok) return; } FileNames.push_back(string(FFD.cFileName)); } } #else void mylistdir(const string &DirName, vector &FileNames) { FileNames.clear(); DIR *dir = opendir(DirName.c_str()); if (dir == 0) Die("Directory not found: %s", DirName.c_str()); for (;;) { struct dirent *dp = readdir(dir); if (dp == 0) break; FileNames.push_back(string(dp->d_name)); } closedir(dir); } #endif double GetPeakMemUseBytes() { return g_PeakMemUseBytes; } const char *SecsToHHMMSS(unsigned Secs) { unsigned HH = Secs/3600; unsigned MM = (Secs - HH*3600)/60; unsigned SS = Secs%60; if (HH == 0) sprintf(g_TmpStr, "%02u:%02d", MM, SS); else sprintf(g_TmpStr, "%02u:%02u:%02u", HH, MM, SS); return g_TmpStr; } const char *SecsToStr(double Secs) { if (Secs >= 60.0) return SecsToHHMMSS((unsigned) Secs); if (Secs < 1e-6) sprintf(g_TmpStr, "%.2gs", Secs); else if (Secs < 1e-3) sprintf(g_TmpStr, "%.2fms", Secs*1e3); else if (Secs < 1.0) sprintf(g_TmpStr, "%.3fs", Secs); else if (Secs < 10.0) 
sprintf(g_TmpStr, "%.2fs", Secs); else sprintf(g_TmpStr, "%.1fs", Secs); return g_TmpStr; } const char *MemBytesToStr(double Bytes) { if (Bytes < 1e4) sprintf(g_TmpStr, "%.1fb", Bytes); else if (Bytes < 1e6) sprintf(g_TmpStr, "%.1fkb", Bytes/1e3); else if (Bytes < 10e6) sprintf(g_TmpStr, "%.1fMb", Bytes/1e6); else if (Bytes < 1e9) sprintf(g_TmpStr, "%.0fMb", Bytes/1e6); else if (Bytes < 100e9) sprintf(g_TmpStr, "%.1fGb", Bytes/1e9); else sprintf(g_TmpStr, "%.0fGb", Bytes/1e9); return g_TmpStr; } bool IsValidFloatStr(const char *s) { char *p = 0; double d = strtod(s, &p); bool Bad = (p == 0 || *p != 0); return !Bad; } bool IsValidFloatStr(const string &s) { return IsValidFloatStr(s.c_str()); } double StrToFloat(const string &s, bool StarIsDblMax) { return StrToFloat(s.c_str(), StarIsDblMax); } double StrToFloat(const char *s, bool StarIsDblMax) { if (StarIsDblMax && s[0] == '*' && s[1] == 0) return DBL_MAX; if (!IsValidFloatStr(s)) Die("Invalid floating-point number '%s'", s); return atof(s); } double StrToMemBytes(const string &s) { unsigned n = SIZE(s); if (n == 0) return 0.0; double d = StrToFloat(s.c_str()); char c = toupper(s[n-1]); if (isdigit(c)) return d; else if (c == 'K') return 1000.0*d; else if (c == 'M') return 1e6*d; else if (c == 'G') return 1e9*d; else Die("Invalid amount of memory '%s'", s.c_str()); return 0.0; } bool Replace(string &s, const string &a, const string &b) { size_t n = s.find(a); if (n == string::npos) return false; string t; for (size_t i = 0; i < n; ++i) t += s[i]; size_t m = a.size(); for (size_t i = n + m; i < n; ++i) t += s[i]; s = t; return true; } bool EndsWith(const string &s, const string &t) { unsigned n = SIZE(s); unsigned m = SIZE(t); if (n < m) return false; for (unsigned i = 0; i < m; ++i) if (s[n-i-1] != t[m-i-1]) return false; return true; } bool IsUintStr(const char *s) { if (!isdigit(*s++)) return false; while (*s) if (!isdigit(*s++)) return false; return true; } unsigned StrToUint(const char *s, bool StarIsUnitMax) { 
if (StarIsUnitMax && s[0] == '*' && s[1] == 0) return UINT_MAX; if (!IsUintStr(s)) Die("Invalid integer '%s'", s); unsigned n = 0; while (char c = *s++) { if (!isdigit(c)) return n; n = n*10 + (c - '0'); } return n; } uint64 StrToUint64(const char *s) { if (!IsUintStr(s)) Die("Invalid integer '%s'", s); uint64 n = 0; while (char c = *s++) { if (!isdigit(c)) return n; n = n*10 + (c - '0'); } return n; } uint64 StrToUint64(const string &s) { return StrToUint64(s.c_str()); } unsigned StrToUint(const string &s, bool StarIsUnitMax) { return StrToUint(s.c_str(), StarIsUnitMax); } const char *IntToStr2(uint64 i) { static char *TmpStr = 0; if (TmpStr == 0) TmpStr = (char *) malloc(64); if (i < 9999) sprintf(TmpStr, "%u", (unsigned) i); else if (i < UINT_MAX) sprintf(TmpStr, "%u (%s)", (unsigned) i, IntToStr(i)); else return IntToStr(i); return TmpStr; } const char *PctToStr(double Pct) { if (Pct == 0.0) sprintf(g_TmpStr, "0%%"); else if (Pct < 0.1) sprintf(g_TmpStr, "%.3g%%", Pct); else if (Pct >= 0.1) sprintf(g_TmpStr, "%.2f%%", Pct); else sprintf(g_TmpStr, "%.3f%%", Pct); return g_TmpStr; } const char *IntToStr(uint64 i) { double d = (double) i; if (i < 10000) sprintf(g_TmpStr, "%u", (unsigned) i); else if (i < 1e6) sprintf(g_TmpStr, "%.1fk", d/1e3); else if (i < 100e6) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (i < 1e9) sprintf(g_TmpStr, "%.0fM", d/1e6); else if (i < 10e9) sprintf(g_TmpStr, "%.1fG", d/1e9); else if (i < 100e9) sprintf(g_TmpStr, "%.0fG", d/1e9); else sprintf(g_TmpStr, "%.3g", d); return g_TmpStr; } const char *Int64ToStr(uint64 i) { double d = (double) i; if (i < 10000) sprintf(g_TmpStr, "%u", (unsigned) i); else if (i < 1e6) sprintf(g_TmpStr, "%.1fk", d/1e3); else if (i < 10e6) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (i < 1e9) sprintf(g_TmpStr, "%.0fM", d/1e6); else if (i < 10e9) sprintf(g_TmpStr, "%.1fG", d/1e9); else if (i < 100e9) sprintf(g_TmpStr, "%.0fG", d/1e9); else sprintf(g_TmpStr, "%.3g", d); return g_TmpStr; } const char 
*FloatToStr(double d) { double a = fabs(d); if (a < 0.01) sprintf(g_TmpStr, "%.3g", a); else if (a >= 0.01 && a < 1) sprintf(g_TmpStr, "%.3f", a); else if (a <= 10 && a >= 1) { double intpart; if (modf(a, &intpart) < 0.05) sprintf(g_TmpStr, "%.0f", d); else sprintf(g_TmpStr, "%.1f", d); } else if (a > 10 && a < 10000) sprintf(g_TmpStr, "%.1f", d); else if (a < 1e6) sprintf(g_TmpStr, "%.1fk", d/1e3); else if (a < 10e6) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (a < 1e9) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (a < 999e9) sprintf(g_TmpStr, "%.1fG", d/1e9); else if (a < 999e12) sprintf(g_TmpStr, "%.1fT", d/1e9); else sprintf(g_TmpStr, "%.3g", d); return g_TmpStr; } const char *FloatToStr(uint64 u) { return FloatToStr(double(u)); } const char *IntFloatToStr(double d) { double a = fabs(d); if (a < 1.0) sprintf(g_TmpStr, "%.3g", a); else if (a <= 10) sprintf(g_TmpStr, "%.0f", d); else if (a > 10 && a < 10000) sprintf(g_TmpStr, "%.0f", d); else if (a < 1e6) sprintf(g_TmpStr, "%.1fk", d/1e3); else if (a < 10e6) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (a < 1e9) sprintf(g_TmpStr, "%.1fM", d/1e6); else if (a < 10e9) sprintf(g_TmpStr, "%.1fG", d/1e9); else if (a < 100e9) sprintf(g_TmpStr, "%.1fG", d/1e9); else sprintf(g_TmpStr, "%.3g", d); return g_TmpStr; } static string g_CurrentProgressLine; static string g_ProgressDesc; static unsigned g_ProgressIndex; static unsigned g_ProgressCount; static unsigned g_CurrProgressLineLength; static unsigned g_LastProgressLineLength; static unsigned g_CountsInterval; static unsigned g_StepCalls; static time_t g_TimeLastOutputStep; static string &GetProgressPrefixStr(string &s) { double Bytes = GetMemUseBytes(); unsigned Secs = GetElapsedSecs(); s = string(SecsToHHMMSS(Secs)); if (Bytes > 0) { s.push_back(' '); char Str[32]; sprintf(Str, "%-6s", MemBytesToStr(Bytes)); s += string(Str); } s.push_back(' '); return s; } const char *GetProgressPrefixCStr() { static string s; GetProgressPrefixStr(s); return s.c_str(); } const char 
*GetElapsedTimeStr(string &s) { unsigned Secs = GetElapsedSecs(); s = string(SecsToHHMMSS(Secs)); return s.c_str(); } const char *GetMaxRAMStr(string &s) { char Str[32]; sprintf(Str, "%5s", MemBytesToStr(g_PeakMemUseBytes)); s = string(Str); return s.c_str(); } static bool g_ProgressPrefixOn = true; bool ProgressPrefix(bool On) { bool OldValue = g_ProgressPrefixOn; g_ProgressPrefixOn = On; return OldValue; } void ProgressLog(const char *Format, ...) { string Str; va_list ArgList; va_start(ArgList, Format); myvstrprintf(Str, Format, ArgList); va_end(ArgList); Log("%s", Str.c_str()); bool SavedPrefix = g_ProgressPrefixOn; g_ProgressPrefixOn = false; Progress("%s", Str.c_str()); g_ProgressPrefixOn = SavedPrefix; } void ProgressLogPrefix(const char *Format, ...) { string Str; va_list ArgList; va_start(ArgList, Format); myvstrprintf(Str, Format, ArgList); va_end(ArgList); Log("%s\n", Str.c_str()); Progress("%s\n", Str.c_str()); } void Pr(FILE *f, const char *Format, ...) { if (f == 0) return; va_list args; va_start(args, Format); vfprintf(f, Format, args); va_end(args); } void Progress(const char *Format, ...) 
{ if (opt(quiet)) return; string Str; va_list ArgList; va_start(ArgList, Format); myvstrprintf(Str, Format, ArgList); va_end(ArgList); #if 0 Log("Progress("); for (unsigned i = 0; i < Str.size(); ++i) { char c = Str[i]; if (c == '\r') Log("\\r"); else if (c == '\n') Log("\\n"); else Log("%c", c); } Log(")\n"); #endif //0 for (unsigned i = 0; i < Str.size(); ++i) { if (g_ProgressPrefixOn && g_CurrProgressLineLength == 0) { string s; GetProgressPrefixStr(s); for (unsigned j = 0; j < s.size(); ++j) { fputc(s[j], stderr); ++g_CurrProgressLineLength; } } char c = Str[i]; if (c == '\n' || c == '\r') { for (unsigned j = g_CurrProgressLineLength; j < g_LastProgressLineLength; ++j) fputc(' ', stderr); if (c == '\n') g_LastProgressLineLength = 0; else g_LastProgressLineLength = g_CurrProgressLineLength; g_CurrProgressLineLength = 0; fputc(c, stderr); } else { fputc(c, stderr); ++g_CurrProgressLineLength; } } } void LogProgramInfoAndCmdLine() { if (g_fLog == 0) return; time_t Now = time(0); struct tm *t = localtime(&Now); const char *TimeStr = asctime(t); string Ver; GetVersionString(Ver); Log("%s", Ver.c_str()); Log(" built %s %s\n", __DATE__, __TIME__); Log("Started %s", TimeStr); // there is a newline in TimeStr #ifdef _MSC_VER const char *e = getenv("CYGTZ"); if (e != 0 && strcmp(e, "YES") == 0) putenv("TZ="); #endif PrintCmdLine(g_fLog); } void LogElapsedTimeAndRAM() { time_t Now = time(0); struct tm *t = localtime(&Now); const char *s = asctime(t); unsigned Secs = GetElapsedSecs(); Log("\n"); Log("Finished %s", s); // there is a newline in s Log("Elapsed time %s\n", SecsToHHMMSS((unsigned) Secs)); Log("Max memory %s\n", MemBytesToStr(g_PeakMemUseBytes)); #if WIN32 && DEBUG // Skip exit(), which can be very slow in DEBUG build // VERY DANGEROUS practice, because it skips global destructors. // But if you know the rules, you can break 'em, right? 
ExitProcess(0); #endif } const char *PctStr(double x, double y) { if (y == 0) { if (x == 0) return "100%"; else return "inf%"; } static char Str[16]; double p = x*100.0/y; if (p < 1) sprintf(Str, "%5.2g%%", p); else sprintf(Str, "%5.1f%%", p); return Str; } string &GetProgressLevelStr(string &s) { unsigned Index = g_ProgressIndex; unsigned Count = g_ProgressCount; if (Count == UINT_MAX) { if (Index == UINT_MAX) s = "100%"; else { char Tmp[16]; sprintf(Tmp, "%u", Index); s = Tmp; } } else s = string(PctStr(Index+1, Count)); s += string(" ") + g_ProgressDesc; return s; } static const char *DefaultPCB() { return "Processing"; } static FN_PROGRESS_CALLBACK g_PCB = DefaultPCB; void SetPCB(FN_PROGRESS_CALLBACK PCB) { g_PCB = PCB; } static FILE *g_fProg; static double g_ProgFileSize; static unsigned g_ProgFileTick; static const char *g_ProgFileMsg = "Processing"; static string g_ProgFileStr; void ProgressFileInit(FILE *f, const char *Format, ...) { g_fProg = f; g_ProgFileSize = (double) GetStdioFileSize64(f); g_ProgFileTick = 0; if (Format == 0) g_ProgFileMsg = "Processing"; else { va_list ArgList; va_start(ArgList, Format); myvstrprintf(g_ProgFileStr, Format, ArgList); va_end(ArgList); g_ProgFileMsg = g_ProgFileStr.c_str(); } ProgressStep(0, 1000, "%s", g_ProgFileMsg); } void ProgressFileStep(const char *Format, ...) { double Pos = (double) GetStdioFilePos64(g_fProg); unsigned Tick = (unsigned) ((Pos*998.0)/g_ProgFileSize); if (Tick <= g_ProgFileTick) return; if (Format != 0) { va_list ArgList; va_start(ArgList, Format); myvstrprintf(g_ProgFileStr, Format, ArgList); va_end(ArgList); g_ProgFileMsg = g_ProgFileStr.c_str(); } ProgressStep(Tick, 1000, "%s", g_ProgFileMsg); g_ProgFileTick = Tick; } void ProgressFileDone(const char *Format, ...) 
{ if (Format != 0) { va_list ArgList; va_start(ArgList, Format); myvstrprintf(g_ProgFileStr, Format, ArgList); va_end(ArgList); g_ProgFileMsg = g_ProgFileStr.c_str(); } ProgressStep(999, 1000, "%s", g_ProgFileMsg); } #if TIMING static time_t g_LastLogTimerSecs; #endif void ProgressCallback(unsigned i, unsigned N) { if (opt(quiet)) return; if (i == 0) { g_ProgressIndex = 0; g_ProgressCount = N; g_CountsInterval = 1; g_StepCalls = 0; g_TimeLastOutputStep = 0; if (g_CurrProgressLineLength > 0) Progress("\n"); } bool IsLastStep = (i == UINT_MAX || i + 1 == N); if (!IsLastStep) { ++g_StepCalls; if (g_StepCalls%g_CountsInterval != 0) return; time_t Now = time(0); if (Now == g_TimeLastOutputStep) { if (g_CountsInterval < 128) g_CountsInterval = (g_CountsInterval*3)/2; else g_CountsInterval += 64; return; } else { time_t Secs = Now - g_TimeLastOutputStep; if (Secs > 1) g_CountsInterval = unsigned(g_CountsInterval/(Secs*8)); } if (g_CountsInterval < 1) g_CountsInterval = 1; g_TimeLastOutputStep = Now; } g_ProgressIndex = i; Progress(" %s", PctStr(i+1, N)); Progress(" %s\r", (g_PCB)()); if (IsLastStep) { g_CountsInterval = 1; fputc('\n', stderr); } } void ProgressStep64(uint64 i64, uint64 N64, const char *Msg) { unsigned i; if (i64 == 0) i = 0; else if (i64 + 1 == N64) i = 999; else i = unsigned(double(i64)*997.0/double(N64)) + 1; ProgressStep(i, 1000, Msg); } void ProgressStep(unsigned i, unsigned N, const char *Format, ...) 
{ if (opt(quiet)) return; if (i == 0) { string Str; va_list ArgList; va_start(ArgList, Format); myvstrprintf(Str, Format, ArgList); va_end(ArgList); g_ProgressDesc = Str; g_ProgressIndex = 0; g_ProgressCount = N; g_CountsInterval = 1; g_StepCalls = 0; g_TimeLastOutputStep = 0; if (g_CurrProgressLineLength > 0) Progress("\n"); } assert(N == g_ProgressCount); if (i >= N && i != UINT_MAX) { static bool WarningDone = false; if (!WarningDone) { Warning("ProgressStep(%u,%u)", i, N); WarningDone = true; } return; } bool IsLastStep = (i == UINT_MAX || i + 1 == N); if (!IsLastStep) { ++g_StepCalls; if (g_StepCalls%g_CountsInterval != 0) return; time_t Now = time(0); if (Now == g_TimeLastOutputStep) { if (g_CountsInterval < 128) g_CountsInterval = (g_CountsInterval*3)/2; else g_CountsInterval += 64; return; } else { time_t Secs = Now - g_TimeLastOutputStep; if (Secs > 1) g_CountsInterval = unsigned(g_CountsInterval/(Secs*8)); } if (g_CountsInterval < 1) g_CountsInterval = 1; g_TimeLastOutputStep = Now; } g_ProgressIndex = i; if (i > 0) { va_list ArgList; va_start(ArgList, Format); myvstrprintf(g_ProgressDesc, Format, ArgList); } string LevelStr; GetProgressLevelStr(LevelStr); Progress(" %s\r", LevelStr.c_str()); if (IsLastStep) { g_CountsInterval = 1; fputc('\n', stderr); string s; GetProgressPrefixStr(s); Log("%s %s\n", s.c_str(), LevelStr.c_str()); } } static unsigned GetStructPack() { struct { char a; char b; } x; return (unsigned) (&x.b - &x.a); } void CompilerInfo() { printf("%u bits\n", BITS); #ifdef __GNUC__ printf("__GNUC__\n"); #endif #ifdef __APPLE__ printf("__APPLE__\n"); #endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif #define x(t) printf("sizeof(" #t ") = %d\n", (int) sizeof(t)); x(int) x(long) x(float) x(double) x(void *) x(off_t) x(size_t) #undef x printf("pack(%u)\n", GetStructPack()); #ifdef _FILE_OFFSET_BITS printf("_FILE_OFFSET_BITS = %d\n", _FILE_OFFSET_BITS); #else printf("_FILE_OFFSET_BITS not defined\n"); #endif exit(0); } bool 
StartsWith(const char *S, const char *T) { for (;;) { char t = *T++; if (t == 0) return true; char s = *S++; if (s != t) return false; } } void Reverse(string &s) { unsigned n = SIZE(s); string t; for (unsigned i = 0; i < n; ++i) t += s[n-i-1]; s = t; } bool StartsWith(const string &S, const char *T) { return StartsWith(S.c_str(), T); } bool StartsWith(const string &s, const string &t) { return StartsWith(s.c_str(), t.c_str()); } void ToUpper(const string &s, string &t) { t.clear(); const unsigned n = SIZE(s); for (unsigned i = 0; i < n; ++i) t.push_back(toupper(s[i])); } void ToLower(const string &s, string &t) { t.clear(); const unsigned n = SIZE(s); for (unsigned i = 0; i < n; ++i) t.push_back(tolower(s[i])); } void StripWhiteSpace(string &Str) { unsigned n = SIZE(Str); unsigned FirstNonWhite = UINT_MAX; unsigned LastNonWhite = UINT_MAX; for (unsigned i = 0; i < n; ++i) { char c = Str[i]; if (!isspace(c)) { if (FirstNonWhite == UINT_MAX) FirstNonWhite = i; LastNonWhite = i; } } if (FirstNonWhite == UINT_MAX) return; string t; for (unsigned i = FirstNonWhite; i <= LastNonWhite; ++i) { char c = Str[i]; t += c; } Str = t; } void Split(const string &Str, vector &Fields, char Sep) { Fields.clear(); const unsigned Length = (unsigned) Str.size(); string s; for (unsigned i = 0; i < Length; ++i) { char c = Str[i]; if ((Sep == 0 && isspace(c)) || c == Sep) { if (!s.empty() || Sep != 0) Fields.push_back(s); s.clear(); } else s.push_back(c); } if (!s.empty()) Fields.push_back(s); } void GetVersionString(string &s) { const char *GIT_VER = #include "gitver.txt" ; const char *Flags = "" #if DEBUG "D" #endif #if TIMING "T" #endif ; if (GIT_VER == 0) GIT_VER = "-"; Ps(s, "%s %s.%s%s [%s]", PROGRAM_NAME, MY_VERSION, GetPlatform(), Flags, GIT_VER); } void PrintVersion(FILE *f) { if (f == 0) return; string s; GetVersionString(s); fputs(s.c_str(), f); fputc('\n', f); fprintf(f, "Built %s %s\n", __DATE__, __TIME__); } void cmd_version() { PrintVersion(stdout); printf("\n"); exit(0); 
} void PrintBanner(FILE *f) { if (f == 0) return; string s; GetVersionString(s); double RAM = GetPhysMemBytes(); fprintf(f, "\n"); fprintf(f, "%s", s.c_str()); fprintf(f, " %s RAM", MemBytesToStr(RAM)); fprintf(f, ", %u cores\n", GetCPUCoreCount()); fprintf(f, "Built %s %s\n", __DATE__, __TIME__); fprintf(f, "(C) Copyright 2004-2021 Robert C. Edgar.\n"); fprintf(f, "https://drive5.com\n\n"); } void PrintCmdLine(FILE *f) { if (f == 0) return; for (unsigned i = 0; i < SIZE(g_Argv); ++i) fprintf(f, "%s ", g_Argv[i].c_str()); fprintf(f, "\n"); } void GetCmdLine(string &s) { s.clear(); for (unsigned i = 0; i < SIZE(g_Argv); ++i) { if (i > 0) s += " "; s += g_Argv[i]; } } char *mystrsave(const char *s) { unsigned n = unsigned(strlen(s)); char *t = myalloc(char, n+1); memcpy(t, s, n+1); return t; } unsigned myipow(unsigned x, unsigned y) { unsigned result = 1; for (unsigned k = 0; k < y; ++k) { if (result > UINT_MAX/x) Die("myipow(%u, %u), overflow", x, y); result *= x; } return result; } uint64 myipow64(unsigned x, unsigned y) { uint64 result = 1; for (unsigned k = 0; k < y; ++k) { if (result > uint64(UINT64_MAX)/uint64(x)) Die("myipow(%u, %u), overflow", x, y); result *= x; } return result; } void LogInt(unsigned i, unsigned w) { if (w == UINT_MAX) { if (i < 9999) Log("%u", i); else Log("%u (%s)", i, IntToStr(i)); } else { if (i < 9999) Log("%*u", w, i); else Log("%*u (%s)", w, i, IntToStr(i)); } } void Logu(unsigned u, unsigned w, unsigned prefixspaces) { for (unsigned i = 0; i < prefixspaces; ++i) Log(" "); if (u == UINT_MAX) Log("%*.*s", w, w, "*"); else Log("%*u", w, u); } void Logf(float x, unsigned w, unsigned prefixspaces) { for (unsigned i = 0; i < prefixspaces; ++i) Log(" "); if (x == FLT_MAX) Log("%*.*s", w, w, "*"); else Log("%*.2f", w, x); } static uint32 g_SLCG_state = 1; // Simple Linear Congruential Generator // Bad properties; used just to initialize the better generator. 
// Numerical values used by Microsoft C, according to wikipedia: // http://en.wikipedia.org/wiki/Linear_congruential_generator static uint32 g_SLCG_a = 214013; static uint32 g_SLCG_c = 2531011; static uint32 SLCG_rand() { g_SLCG_state = g_SLCG_state*g_SLCG_a + g_SLCG_c; return g_SLCG_state; } static void SLCG_srand(uint32 Seed) { g_SLCG_state = Seed; for (int i = 0; i < 10; ++i) SLCG_rand(); } /*** A multiply-with-carry random number generator, see: http://en.wikipedia.org/wiki/Multiply-with-carry The particular multipliers used here were found on the web where they are attributed to George Marsaglia. ***/ static bool g_InitRandDone = false; static uint32 g_X[5]; static void InitRand() { if (g_InitRandDone) return; // Do this first to avoid recursion g_InitRandDone = true; unsigned Seed = 1; if (optset_randseed) { Seed = opt(randseed); if (Seed == 0) Seed = (unsigned) (time(0)*getpid()); } Log("Random number seed %u\n", Seed); ResetRand(Seed); } static void IncrementRand() { uint64 Sum = 2111111111*(uint64) g_X[3] + 1492*(uint64) g_X[2] + 1776*(uint64) g_X[1] + 5115*(uint64) g_X[0] + g_X[4]; g_X[3] = g_X[2]; g_X[2] = g_X[1]; g_X[1] = g_X[0]; g_X[4] = (uint32) (Sum >> 32); g_X[0] = (uint32) Sum; } uint32 RandInt32() { InitRand(); IncrementRand(); return g_X[0]; } unsigned randu32() { return (unsigned) RandInt32(); } uint64 randu64() { union { struct { uint32 u32[2]; }; uint64 u64; } x; x.u32[0] = randu32(); x.u32[1] = randu32(); return x.u64; } void ResetRand(unsigned Seed) { g_InitRandDone = true; SLCG_srand(Seed); for (unsigned i = 0; i < 5; i++) g_X[i] = SLCG_rand(); for (unsigned i = 0; i < 100; i++) IncrementRand(); } unsigned GetCPUCoreCount() { #ifdef _MSC_VER SYSTEM_INFO SI; GetSystemInfo(&SI); unsigned n = SI.dwNumberOfProcessors; if (n == 0 || n > 64) return 1; return n; #else long n = sysconf(_SC_NPROCESSORS_ONLN); if (n <= 0) return 1; return (unsigned) n; #endif } unsigned GetThreadIndex() { return omp_get_thread_num(); } // MUST COME AT END BECAUSE OF 
#undefs #undef myalloc #undef myfree #if RCE_MALLOC #undef mymalloc #undef myfree #undef myfree2 static unsigned g_NewCalls; static unsigned g_FreeCalls; static double g_InitialMemUseBytes; static double g_TotalAllocBytes; static double g_TotalFreeBytes; static double g_NetBytes; static double g_MaxNetBytes; void LogAllocStats() { Log("\n"); Log(" Allocs %u\n", g_NewCalls); Log(" Frees %u\n", g_FreeCalls); Log("Initial alloc %s\n", MemBytesToStr(g_InitialMemUseBytes)); Log(" Total alloc %s\n", MemBytesToStr(g_TotalAllocBytes)); Log(" Total free %s\n", MemBytesToStr(g_TotalFreeBytes)); Log(" Net bytes %s\n", MemBytesToStr(g_NetBytes)); Log("Max net bytes %s\n", MemBytesToStr(g_MaxNetBytes)); Log(" Peak total %s\n", MemBytesToStr(g_MaxNetBytes + g_InitialMemUseBytes)); } void *mymalloc(unsigned n, unsigned bytes, const char *FileName, int Line) { void *rce_malloc(unsigned bytes, const char *FileName, int Line); return rce_malloc(n*bytes, FileName, Line); } void myfree(void *p, const char *FileName, int Line) { void rce_free(void *p, const char *FileName, int Line); rce_free(p, FileName, Line); } void myfree2(void *p, unsigned bytes, const char *FileName, int Line) { void rce_free(void *p, const char *FileName, int Line); rce_free(p, FileName, Line); } #else // RCE_MALLOC #if ALLOC_TOTALS void LogAllocSummary() { extern unsigned g_AllocCount; extern unsigned g_FreeCount; extern uint64 g_AllocTotal; extern uint64 g_FreeTotal; double RAM = GetMemUseBytes(); Log("RAM %s", MemBytesToStr(RAM)); Log(", malloc %s", MemBytesToStr(g_AllocTotal)); Log(", free %s", MemBytesToStr(g_FreeTotal)); Log(", net %s\n", MemBytesToStr(g_AllocTotal - g_FreeTotal)); } #endif void *mymalloc64(unsigned BytesPerObject, uint64 N) { uint64 Bytes = N*BytesPerObject; if (Bytes >= UINT32_MAX - 1024) Die("Memory object >4Gb, probably due to long seqences"); byte *p = (byte *) malloc(Bytes); if (p == 0) Die("myalloc64(%u, %.3g) failed", BytesPerObject, double(N)); return p; } void *mymalloc(unsigned 
n, unsigned bytes) { ++g_AllocCount; uint64 Bytes64 = uint64(n)*uint64(bytes); if (Bytes64 > uint64(UINT_MAX)) Die("%s(%u): mymalloc(%u, %u) overflow", g_AllocFile, g_AllocLine, n, bytes); #if ALLOC_TOTALS g_AllocTotal += Bytes64; Bytes64 += 4; #endif uint32 Bytes32 = uint32(Bytes64); void *p = malloc(Bytes32); if (0 == p) { double b = GetMemUseBytes(); double Total = b + double(Bytes32); #if BITS==32 if (Total > 2e9) { Log("\n%s(%u): Out of memory, mymalloc(%u, %u), curr %.3g bytes, total %.3g (%s)\n", g_AllocFile, g_AllocLine, n, bytes, b, Total, MemBytesToStr(Total)); Die("Memory limit of 32-bit process exceeded, 64-bit build required"); } #endif fprintf(stderr, "\n%s(%u): Out of memory mymalloc(%u), curr %.3g bytes", g_AllocFile, g_AllocLine, (unsigned) bytes, b); #if DEBUG && defined(_MSC_VER) asserta(_CrtCheckMemory()); #endif Die("%s(%u): Out of memory, mymalloc(%u, %u), curr %.3g bytes, total %.3g (%s)\n", g_AllocFile, g_AllocLine, n, bytes, b, Total, MemBytesToStr(Total)); } #if ALLOC_TOTALS *((uint32 *) p) = Bytes32; return (void *) ((byte *) p + 4); #else return p; #endif } void myfree(void *p) { if (p == 0) return; ++g_FreeCount; #if ALLOC_TOTALS uint32 *pi = (uint32 *) p; uint32 Bytes32 = *(pi - 1); g_FreeTotal += Bytes32; free((void *) (pi - 1)); #else free(p); #endif } #endif // RCE_MALLOC void CompilerInfo(); vector g_Argv; #define FLAG_OPT(Name) bool opt_##Name; bool optset_##Name; bool optused_##Name; #define UNS_OPT(Name) unsigned opt_##Name; bool optset_##Name; bool optused_##Name; #define FLT_OPT(Name) double opt_##Name; bool optset_##Name; bool optused_##Name; #define STR_OPT(Name) string opt_##Name; bool optset_##Name; bool optused_##Name; #include "myopts.h" static void CheckUsedOpt(bool Set, bool Used, const char *Name) { if (Set && !Used) Warning("Option -%s not used", Name); } void CheckUsedOpts(bool LogAll) { #define FLAG_OPT(Name) CheckUsedOpt(optset_##Name, optused_##Name, #Name); #define UNS_OPT(Name) CheckUsedOpt(optset_##Name, 
optused_##Name, #Name); #define FLT_OPT(Name) CheckUsedOpt(optset_##Name, optused_##Name, #Name); #define STR_OPT(Name) CheckUsedOpt(optset_##Name, optused_##Name, #Name); #include "myopts.h" } static void CmdLineErr(const char *Format, ...) { fprintf(stderr, "\n\n"); va_list ArgList; va_start(ArgList, Format); fprintf(stderr, "Invalid command line\n"); vfprintf(stderr, Format, ArgList); fprintf(stderr, "\n\n"); va_end(ArgList); exit(1); } static void GetArgsFromFile(const string &FileName, vector &Args) { Args.clear(); FILE *f = OpenStdioFile(FileName); string Line; while (ReadLineStdioFile(f, Line)) { size_t n = Line.find('#'); if (n != string::npos) Line = Line.substr(0, n); vector Fields; Split(Line, Fields, 0); Args.insert(Args.end(), Fields.begin(), Fields.end()); } CloseStdioFile(f); } static bool TryFlagOpt(const char *OptName) { #define FLAG_OPT(Name) if (strcmp(OptName, #Name) == 0) { opt_##Name = true; optset_##Name = true; return true; } #define UNS_OPT(Name) /* empty */ #define FLT_OPT(Name) /* empty */ #define STR_OPT(Name) /* empty */ #include "myopts.h" return false; } static bool TryUnsOpt(const char *OptName, const char *Value) { #define UNS_OPT(Name) if (strcmp(OptName, #Name) == 0) { opt_##Name = StrToUint(Value); optset_##Name = true; return true; } #define FLAG_OPT(Name) /* empty */ #define FLT_OPT(Name) /* empty */ #define STR_OPT(Name) /* empty */ #include "myopts.h" return false; } static bool TryFloatOpt(const char *OptName, const char *Value) { #define FLT_OPT(Name) if (strcmp(OptName, #Name) == 0) { opt_##Name = StrToFloat(Value); optset_##Name = true; return true; } #define UNS_OPT(Name) /* empty */ #define FLAG_OPT(Name) /* empty */ #define STR_OPT(Name) /* empty */ #include "myopts.h" return false; } static bool TryStrOpt(const char *OptName, const char *Value) { #define STR_OPT(Name) if (strcmp(OptName, #Name) == 0) { opt_##Name = mystrsave(Value); optset_##Name = true; return true; } #define UNS_OPT(Name) /* empty */ #define 
FLT_OPT(Name) /* empty */ #define FLAG_OPT(Name) /* empty */ #include "myopts.h" return false; } void MyCmdLine(int argc, char **argv) { if (argc == 1) { void Usage(FILE *f); Usage(stdout); exit(0); } for (unsigned i = 0; i < (unsigned) argc; ) { const string &Arg = argv[i]; if (Arg == "file:" && i + 1 < (unsigned) argc) { const string &FileName = argv[i+1]; vector Args; GetArgsFromFile(FileName, Args); for (unsigned k = 0; k < SIZE(Args); ++k) g_Argv.push_back(Args[k]); i += 2; } else { g_Argv.push_back(Arg); i += 1; } } const unsigned ArgCount = SIZE(g_Argv); unsigned ArgIndex = 1; for (;;) { if (ArgIndex >= ArgCount) break; const string &Arg = g_Argv[ArgIndex]; if (Arg.size() > 1 && Arg[0] == '-') { string LongName = (Arg.size() > 2 && Arg[1] == '-' ? Arg.substr(2) : Arg.substr(1)); if (LongName == "version") { void cmd_version(); cmd_version(); return; } bool IsFlag = TryFlagOpt(LongName.c_str()); if (IsFlag) { ++ArgIndex; continue; } ++ArgIndex; if (ArgIndex >= ArgCount) CmdLineErr("Invalid option or missing value -%s", LongName.c_str()); const char *Value = g_Argv[ArgIndex].c_str(); bool IsUns = TryUnsOpt(LongName.c_str(), Value); if (IsUns) { ++ArgIndex; continue; } bool IsFloat = TryFloatOpt(LongName.c_str(), Value); if (IsFloat) { ++ArgIndex; continue; } bool IsStr = TryStrOpt(LongName.c_str(), Value); if (IsStr) { ++ArgIndex; continue; } CmdLineErr("Unknown option %s", LongName.c_str()); } else if ((byte) Arg[0] > 127) CmdLineErr("Invalid 8-bit byte in '%s' (did you paste from web page?)", Arg.c_str()); else CmdLineErr("Expected -option_name or --option_name, got '%s'", Arg.c_str()); } #if TIMING if (opt_threads > 1) Die("--threads > 1 && TIMING"); #endif if (opt_compilerinfo) { CompilerInfo(); exit(0); } } void GetAccFromLabel(const string &Label, string &Acc) { Acc.clear(); for (uint i = 0; i < SIZE(Label); ++i) { char c = Label[i]; if (isalnum(c) || c == '_') Acc += c; else return; } } void GetBaseName(const string &PathName, string &BaseName) { 
BaseName.clear(); const uint n = SIZE(PathName); if (n == 0) return; uint Start = 0; for (uint i = 0; i + 1 < n; ++i) { char c = PathName[i]; if (c == '/' || c == '\\') Start = i + 1; } for (uint i = Start; i < n; ++i) { char c = PathName[i]; if (i + n == n && c == '/') break; BaseName += c; } const char *s = BaseName.c_str(); #define x(e) { const char *p = strstr(s, e); if (p != 0) BaseName.resize(p-s); } x(".afa") x(".fa") x(".aln") x(".msa") #undef x } void SeqToFasta(FILE *f, const string &Seq, const string &Label) { SeqToFasta(f, (const byte *) Seq.c_str(), SIZE(Seq), Label.c_str()); } void SeqToFasta(FILE *f, const byte *Seq, unsigned L, const char *Label) { if (f == 0) return; if (L == 0) return; if (Label != 0) fprintf(f, ">%s\n", Label); const unsigned ROWLEN = 80; if (ROWLEN == 0) { WriteStdioFile(f, Seq, L); fputc('\n', f); return; } unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN; for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex) { unsigned From = BlockIndex*ROWLEN; unsigned To = From + ROWLEN; if (To >= L) To = L; for (unsigned Pos = From; Pos < To; ++Pos) fputc(Seq[Pos], f); fputc('\n', f); } } // Fisher-Yates shuffle: // To shuffle an array a of n elements (indices 0 .. 
n-1): // for i from n - 1 downto 1 do // j := random integer with 0 <= j <= i // exchange a[j] and a[i] void Shuffle(vector &v) { const unsigned N = SIZE(v); for (unsigned i = N - 1; i >= 1; --i) { unsigned j = randu32()%(i + 1); unsigned vi = v[i]; unsigned vj = v[j]; v[i] = vj; v[j] = vi; } } void Dirize(string &Dir) { if (!EndsWith(Dir, "/")) Dir += "/"; } muscle-5.1.0/src/myutils.h000066400000000000000000000332431424453062600154660ustar00rootroot00000000000000#ifndef myutils_h #define myutils_h #ifdef _MSC_VER #if !defined(MYUTILS_CPP) #define _SCL_SECURE_NO_WARNINGS #endif // !defined(IN_MYUTILS_CPP) #endif #define RCE_MALLOC 0 #define TRACK_ALLOC 0 #define ALLOC_TOTALS 0 #if defined(__x86_64__) || defined(_M_X64) || defined(__arm64__) #define BITS 64 #else #define BITS 32 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // isatty() #ifdef WIN32 #include #else #include #endif #ifndef _MSC_VER #define _stricmp strcasecmp #endif //#ifndef max //#define max(x, y) ((x) >= (y) ? (x) : (y)) //#define min(x, y) ((x) <= (y) ? 
(x) : (y)) //#endif using namespace std; #ifdef _MSC_VER #include #pragma warning(disable: 4996) // deprecated functions #define _CRT_SECURE_NO_DEPRECATE 1 #endif #if defined(_DEBUG) && !defined(DEBUG) #define DEBUG 1 #endif #if defined(DEBUG) && !defined(_DEBUG) #define _DEBUG 1 #endif #ifndef NDEBUG #define DEBUG 1 #define _DEBUG 1 #endif #define byte __mybyte__ typedef unsigned char byte; typedef unsigned short uint16; typedef unsigned uint32; typedef unsigned uint; // typedefs for int64 and uint64 #if defined(_MSC_VER) typedef __int64 int64; typedef unsigned __int64 uint64; #elif defined(__GNUC__) typedef long int64; typedef unsigned long uint64; #else #error "int64 typedefs" #endif #if BITS==32 typedef uint32 uintb; #else typedef uint64 uintb; #endif #ifndef UINT32_MAX const uint32 UINT32_MAX = (~(uint32(0))); #endif #ifndef UINT64_MAX const uint64 UINT64_MAX = (~(uint64(0))); #endif #ifndef SIZE_T_MAX const size_t SIZE_T_MAX = (~(size_t(1))); #endif void myassertfail(const char *Exp, const char *File, unsigned Line); #undef assert #ifdef NDEBUG #define assert(exp) ((void)0) #define myassert(exp) ((void)0) #else #define assert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) #define myassert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) #endif #define asserta(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) #define ureturn(x) return (x) #if DEBUG && defined(_MSC_VER) #if RCE_MALLOC #define _chkmem() rce_chkmem() #else #define _chkmem() asserta(_CrtCheckMemory()) #endif #endif #if ALLOC_TOTALS void LogAllocSummary(); #endif #define NotUsed(v) ((void *) &v) // pom=plus or minus, tof=true or false, yon=yes or no static inline char pom(bool Plus) { return Plus ? '+' : '-'; } static inline char tof(bool x) { return x ? 'T' : 'F'; } static inline char yon(bool x) { return x ? 'Y' : 'N'; } static inline const char *YesOrNo(bool x) { return x ? 
"Yes" : "No"; } static inline const char *plurals(unsigned n) { return n == 1 ? "" : "s"; } const char *GetPlatform(); unsigned GetElapsedSecs(); void mysleep(unsigned ms); void mylistdir(const string &DirName, vector &FileNames); void PrintVersion(FILE *f); #if RCE_MALLOC void *rce_malloc(unsigned n, unsigned bytes, const char *FileName, int Line); void rce_free(void *p, const char *FileName, int LineNr); void rce_chkmem(); void rce_assertvalidptr_(void *p, const char *FileName, int LineNr); #define rce_assertvalidptr(p) rce_assertvalidptr_(p, __FILE__, __LINE__) void rce_dumpptr_(void *p, const char *FileName, int LineNr); #define rce_dumpptr(p) rce_dumpptr_(p, __FILE__, __LINE__) #define mymalloc(n, m) rce_malloc((n), (m), __FILE__, __LINE__) #define myfree(p) rce_free(p, __FILE__, __LINE__) #define myalloc(t, n) (t *) rce_malloc((n)*sizeof(t), __FILE__, __LINE__) #else // RCE_MALLOC void *mymalloc(unsigned n, unsigned bytes); void myfree(void *p); #define rce_chkmem() /* empty */ extern unsigned g_AllocLine; extern const char *g_AllocFile; #define myalloc(t, n) (t *) (g_AllocLine = __LINE__, g_AllocFile = __FILE__, (mymalloc((n), sizeof(t)))) void *mymalloc64(unsigned BytesPerObject, uint64 N); #define myalloc64(t, n) (t *) mymalloc64(sizeof(t), (n)); #endif // RCE_MALLOC #if TRACK_ALLOC #undef myalloc #undef myfree void *myalloc_track(unsigned Bytes, const char *FileName, int LineNr); void myfree_track(void *p, const char *FileName, int LineNr); void myalloc_trace(bool On); #define myalloc(t, n) (t *) myalloc_track(sizeof(t)*(n), __FILE__, __LINE__) #define myfree(p) myfree_track(p, __FILE__, __LINE__) #endif // TRACK_ALLOC #define SIZE(c) unsigned((c).size()) #define RoundUp(Bytes, BlockSize) ((Bytes) + ((BlockSize) - (Bytes)%(BlockSize))) bool myisatty(int fd); #ifdef _MSC_VER #define off_t __int64 #endif void GetVersionString(string &s); // Stdio functions without "nr of bytes" arg. 
FILE *OpenStdioFile(const string &FileName); FILE *CreateStdioFile(const string &FileName); void CloseStdioFile(FILE *f); bool ReadLineStdioFile(FILE *f, string &Line); void ReadTabbedLineStdioFile(FILE *f, vector &Fields, unsigned FieldCount); void FlushStdioFile(FILE *f); bool StdioFileExists(const string &FileName); void LogStdioFileState(FILE *f); void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo); void MoveStdioFile(const string &FileName1, const string &FileName2); void DeleteStdioFile(const string &FileName); void WriteStdioFileStr(FILE *f, const char *s); void Pr(FILE *f, const char *Format, ...); void ParseFileName(const string &FileName, string &Path, string &Name); // void ReadDir(const string &DirName, vector &FileNames); // Stdio functions with size args: byte *ReadAllStdioFile32(FILE *f, uint32 &FileSize); byte *ReadAllStdioFile64(FILE *f, uint64 &FileSize); byte *ReadAllStdioFile(FILE *f, uint32 &FileSize); byte *ReadAllStdioFile64(FILE *f, uint64 &FileSize); byte *ReadAllStdioFile32(const string &FileName, uint32 &FileSize); byte *ReadAllStdioFile64(const string &FileName, uint64 &FileSize); bool ReadLineStdioFile(FILE *f, char *Line, uint32 Bytes); bool ReadLineStdioFile64(FILE *f, char *Line, uint64 Bytes); void SetStdioFilePos(FILE *f, uint32 Pos); void SetStdioFilePos64(FILE *f, uint64 Pos); uint32 GetStdioFilePos32(FILE *f); uint64 GetStdioFilePos64(FILE *f); uint32 GetStdioFileSize32(FILE *f); uint64 GetStdioFileSize64(FILE *f); #if BITS==32 #define uintB uint32 #define GetStdioFilePosB GetStdioFilePos32 #define GetStdioFileSizeB GetStdioFileSize32 #define SetStdioFilePosB SetStdioFilePos #else #define uintB uint64 #define GetStdioFilePosB GetStdioFilePos64 #define GetStdioFileSizeB GetStdioFileSize64 #define SetStdioFilePosB SetStdioFilePos64 #endif uint32 ReadStdioFile_NoFail(FILE *f, void *Buffer, uint32 Bytes); void ReadStdioFile(FILE *f, uint32 Pos, void *Buffer, uint32 Bytes); void ReadStdioFile64(FILE *f, uint64 
  Pos, void *Buffer, uint64 Bytes);

void ReadStdioFile(FILE *f, void *Buffer, uint32 Bytes);
void ReadStdioFile64(FILE *f, void *Buffer, uint64 Bytes);

void WriteStdioFile(FILE *f, uint32 Pos, const void *Buffer, uint32 Bytes);
void WriteStdioFile64(FILE *f, uint64 Pos, const void *Buffer, uint64 Bytes);

void WriteStdioFile(FILE *f, const void *Buffer, uint32 Bytes);
void WriteStdioFile64(FILE *f, const void *Buffer, uint64 Bytes);

/* Packs four byte values into one uint32, a in the most significant byte. */
#define MAGIC(a, b, c, d)	uint32(uint32(a)<<24 | uint32(b)<<16 | uint32(c)<<8 | (d))

void Pf(FILE *f, const char *szFormat, ...);

void Ps(string &Str, const char *szFormat, ...);
void Psa(string &Str, const char *szFormat, ...);
void Psasc(string &Str, const char *szFormat, ...);

void SetLogFileName(const string &FileName);
void Log(const char *szFormat, ...);

/* In DEBUG builds Die and Warning first write "file(line): " of the call
   site to stderr and to the log, then forward to Die_ / Warning_; in
   other builds they are plain aliases for Die_ / Warning_. */
#if	DEBUG

void Die_(const char *szFormat, ...);
void Warning_(const char *szFormat, ...);

typedef void (*PTR_PRINTFLIKE_FN)(const char *Format, ...);

static inline PTR_PRINTFLIKE_FN DiePtr(const char *FileName, unsigned LineNr)
	{
	fprintf(stderr, "\n\n%s(%u): ", FileName, LineNr);
	Log("\n\n%s(%u): ", FileName, LineNr);
	return Die_;
	}

static inline PTR_PRINTFLIKE_FN WarningPtr(const char *FileName, unsigned LineNr)
	{
	fprintf(stderr, "\n\n%s(%u): ", FileName, LineNr);
	Log("\n\n%s(%u): ", FileName, LineNr);
	return Warning_;
	}

#define Die		(*DiePtr(__FILE__, __LINE__))
#define Warning	(*WarningPtr(__FILE__, __LINE__))

#else

void Die_(const char *szFormat, ...);
void Warning_(const char *szFormat, ...);

#define Die		Die_
#define Warning	Warning_

#endif

/* Callback type that returns the message shown by ProgressCallback(). */
typedef const char *(*FN_PROGRESS_CALLBACK)();

void SetPCB(FN_PROGRESS_CALLBACK PCB);
void ProgressCallback(unsigned i, unsigned N);

bool ProgressPrefix(bool On);
void ProgressStep(unsigned i, unsigned N, const char *Format, ...);
void ProgressStep64(uint64 i, uint64 N, const char *Msg);
void Progress(const char *szFormat, ...);
void Progress(const string &Str);
void ProgressLog(const char *szFormat, ...);
void ProgressLogPrefix(const char
*Format, ...); const char *GetProgressPrefixCStr(); void ProgressFileInit(FILE *f, const char *Format = 0, ...); void ProgressFileStep(const char *Format = 0, ...); void ProgressFileDone(const char *Format = 0, ...); void LogElapsedTimeAndRAM(); void LogProgramInfoAndCmdLine(); void Help(); inline unsigned ustrlen(const char *s) { return (unsigned) strlen(s); } inline unsigned ustrlen(const string &s) { return SIZE(s); } char *mystrsave(const char *s); unsigned myipow(unsigned x, unsigned y); uint64 myipow64(unsigned x, unsigned y); static inline unsigned atou(const char *s) { return unsigned(atoi(s)); } static inline unsigned atou(const string &s) { return unsigned(atoi(s.c_str())); } unsigned GetThreadIndex(); double GetMemUseBytes(); double GetPeakMemUseBytes(); double GetPhysMemBytes(); double GetUsableMemBytes(); // https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ inline bool feq(double x, double y) { if (x == y) return true; double X = fabs(x); double Y = fabs(y); double Max = max(X, Y); double Diff = fabs(X-Y); return Diff < Max*0.01; } #define asserteq(x, y) assert(feq(x, y)) #define assertaeq(x, y) asserta(feq(x, y)) #define memset_zero(a, n) memset((a), 0, (n)*sizeof(a[0])) void ResetRand(unsigned Seed); unsigned randu32(); uint64 randu64(); void Split(const string &Str, vector &Fields, char Sep = '\t'); void StripWhiteSpace(string &Str); bool StartsWith(const string &s, const string &t); bool StartsWith(const char *s, const char *t); void ToUpper(const string &s, string &t); void ToLower(const string &s, string &t); void Reverse(string &s); static inline double GetRatio(double x, double y) { if (y == 0) { asserta(x == 0); return 0; } return x/y; } static inline double GetPct(double x, double y) { return 100.0*GetRatio(x, y); } static inline double GetPct64(int64 x, int64 y) { return 100.0*GetRatio(double(x), double(y)); } const char *PctToStr(double Pct); double GetMemUseBytes(); void PrintCmdLine(FILE *f); void 
PrintBanner(FILE *f); const char *MemBytesToStr(double Bytes); static inline const char *MemBytesToStr(uint64 Bytes) { return MemBytesToStr((double) Bytes); } double StrToMemBytes(const string &s); double StrToFloat(const char *s, bool StarIsDblMax = false); double StrToFloat(const string &s, bool StarIsDblMax = false); bool IsValidFloatStr(const char *s); bool IsValidFloatStr(const string &s); const char *GetElapsedTimeStr(string &s); const char *GetMaxRAMStr(string &s); const char *BaseName(const char *PathName); const char *IntToStr(uint64 i); const char *IntToStr2(uint64 i); const char *Int64ToStr(uint64 i); const char *FloatToStr(double d); const char *FloatToStr(uint64 i); const char *IntFloatToStr(double d); const char *SecsToStr(double Secs); bool IsUintStr(const char *s); unsigned StrToUint(const char *s, bool StarIsUnitMax = false); unsigned StrToUint(const string &s, bool StarIsUnitMax = false); uint64 StrToUint64(const char *s); uint64 StrToUint64(const string &s); bool EndsWith(const string &s, const string &t); bool Replace(string &s, const string &a, const string &b); double mylog2(double x); double mylog10(double x); void LogInt(unsigned i, unsigned w = UINT_MAX); void Logu(unsigned u, unsigned w, unsigned prefixspaces = 2); void Logf(float x, unsigned w, unsigned prefixspaces = 2); const char *SecsToHHMMSS(unsigned Secs); unsigned GetCPUCoreCount(); void MyCmdLine(int argc, char **argv); //void CmdLineErr(const char *Format, ...); void GetCmdLine(string &s); #define FLAG_OPT(Name) extern bool opt_##Name; extern bool optset_##Name; extern bool optused_##Name; #define UNS_OPT(Name) extern unsigned opt_##Name; extern bool optset_##Name; extern bool optused_##Name; #define FLT_OPT(Name) extern double opt_##Name; extern bool optset_##Name; extern bool optused_##Name; #define STR_OPT(Name) extern string opt_##Name; extern bool optset_##Name; extern bool optused_##Name; #include "myopts.h" #define opt(Name) (optused_##Name = true, opt_##Name) #define 
optd(Name, Default) (optset_##Name ? (optused_##Name = true, opt_##Name) : Default) void CheckUsedOpts(bool LogAll); extern FILE *g_fLog; void SeqToFasta(FILE *f, const string &Seq, const string &Label); void SeqToFasta(FILE *f, const byte *Seq, unsigned L, const char *Label); void SeqToFastq(FILE *f, const byte *Seq, unsigned L, const char *Qual, const char *Label); void SeqToFastaRC(FILE *f, const byte *Seq, unsigned L, const char *Label); void RevCompSeq(const byte *Seq, unsigned L, byte *RCSeq); void LogAllocs(); unsigned GetRequestedThreadCount(); void Dirize(string &Dir); inline char mytoupper(char c) { return c & (~0x20); } inline char mytoupper(byte c) { return c & (~0x20); } inline bool myislower(byte c) { return (c & 0x20) != 0; } inline bool myislower(char c) { return (c & 0x20) != 0; } bool AccChar(char c); void GetAccFromLabel(const string &Label, string &Acc); void GetBaseName(const string &PathName, string &BaseName); void Shuffle(vector &v); #define NO_TRACE 0 #define TMP_TRACE 2 // true but not 1, for grep_trace #define REMOVEME 1 #endif // myutils_h muscle-5.1.0/src/pairhmm.cpp000066400000000000000000000045321424453062600157470ustar00rootroot00000000000000#include "myutils.h" #include "pairhmm.h" #include "hmmparams.h" float PairHMM::m_StartScore[HMMSTATE_COUNT]; float PairHMM::m_TransScore[HMMSTATE_COUNT][HMMSTATE_COUNT]; float PairHMM::m_MatchScore[256][256]; float PairHMM::m_InsScore[256]; /*** M IX IY JX JY [0] [1] [2] [3] [4] M [0] 0.960 0.012 0.012 0.00801 0.00801 IX [1] 0.603 0.397 0 0 0 IY [2] 0.603 0 0.397 0 0 JX [3] 0.101 0 0 0.899 0 JY [4] 0.101 0 0 0 0.899 ***/ static vector > transMat; static void ConstructTransMat(const vector& gapOpen, const vector& gapExtend) { transMat.clear(); transMat.resize(HMMSTATE_COUNT); for (uint i = 0; i < HMMSTATE_COUNT; ++i) transMat[i].resize(HMMSTATE_COUNT); transMat[0][0] = 1; for (uint i = 0; i < InsertStateCount; i++) { transMat[0][2 * i + 1] = gapOpen[2 * i]; transMat[0][2 * i + 2] = gapOpen[2 * i 
+ 1]; transMat[0][0] -= (gapOpen[2 * i] + gapOpen[2 * i + 1]); transMat[2 * i + 1][2 * i + 1] = gapExtend[2 * i]; transMat[2 * i + 2][2 * i + 2] = gapExtend[2 * i + 1]; transMat[2 * i + 1][2 * i + 2] = 0; transMat[2 * i + 2][2 * i + 1] = 0; transMat[2 * i + 1][0] = 1 - gapExtend[2 * i]; transMat[2 * i + 2][0] = 1 - gapExtend[2 * i + 1]; } asserta(transMat[0][0] > 0); } void PairHMM::Create2(const vector& initDistribMat, const vector > &transMat, const vector& emitSingle, const vector > &emitPairs) { for (int i = 0; i < HMMSTATE_COUNT; i++) m_StartScore[i] = log(initDistribMat[i]); for (int i = 0; i < HMMSTATE_COUNT; i++) for (int j = 0; j < HMMSTATE_COUNT; j++) m_TransScore[i][j] = log(transMat[i][j]); for (int i = 0; i < 256; i++) m_InsScore[i] = log(emitSingle[i]); for (int i = 0; i < 256; i++) for (int j = 0; j < 256; j++) m_MatchScore[i][j] = log(emitPairs[i][j]); } void PairHMM::Create(const vector& initDistribMat, const vector& gapOpen, const vector& gapExtend, const vector>& emitPairs, const vector& emitSingle) { ConstructTransMat(gapOpen, gapExtend); Create2(initDistribMat, transMat, emitSingle, emitPairs); } muscle-5.1.0/src/pairhmm.h000066400000000000000000000021141424453062600154060ustar00rootroot00000000000000#ifndef pairhmm_h #define pairhmm_h #include "scoretype.h" #include "multisequence.h" #include "hmmparams.h" static const uint DEFAULT_CONSISTENCY_ITERS = 2; static const uint DEFAULT_REFINE_ITERS = 100; enum HMMSTATE { HMMSTATE_M = 0, HMMSTATE_IX = 1, HMMSTATE_IY = 2, HMMSTATE_JX = 3, HMMSTATE_JY = 4, HMMSTATE_COUNT = 5 }; static const uint InsertStateCount = 2; class PairHMM { public: static float m_StartScore[HMMSTATE_COUNT]; static float m_TransScore[HMMSTATE_COUNT][HMMSTATE_COUNT]; static float m_MatchScore[256][256]; static float m_InsScore[256]; public: static void Create(const vector& initDistribMat, const vector& gapOpen, const vector& gapExtend, const vector>& emitPairs, const vector& emitSingle); static void Create2(const vector& 
initDistribMat, const vector > &transMat, const vector& emitSingle, const vector > &emitPairs); static void WriteParamsReport(const string &FileName); static void WriteParamsReport(FILE *f); static void FixUT(); }; #endif muscle-5.1.0/src/permutetree.cpp000066400000000000000000000077121424453062600166560ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include "sort.h" void DivideTree(const Tree &InputTree, uint Node, Tree &Subtree, Tree &Supertree); void JoinTrees(const Tree &Tree1, const Tree &Tree2, Tree &OutputTree, float NewEdgeLength); void StringsToFile(const string &FileName, const vector &v) { if (FileName.empty()) return; FILE *f = CreateStdioFile(FileName); const uint N = SIZE(v); for (uint i = 0; i < N; ++i) { fputs(v[i].c_str(), f); fputc('\n', f); } CloseStdioFile(f); } void DivideTreeFraction(const Tree &InputTree, double Fract, Tree &Tree1, Tree &Tree2) { asserta(Fract > 0 && Fract < 1); const uint InputLeafCount = InputTree.GetLeafCount(); asserta(InputLeafCount >= 3); asserta(InputTree.IsRooted()); const uint NodeCount = InputTree.GetNodeCount(); uint BestNode = UINT_MAX; uint BestLeafCount = UINT_MAX; uint BestDiff = UINT_MAX; uint TargetLeafCount = uint(InputLeafCount*Fract + 0.5); if (TargetLeafCount == 0) TargetLeafCount = 1; for (uint Node = 0; Node < NodeCount; ++Node) { uint SubtreeLeafCount = InputTree.GetSubtreeLeafCount(Node); uint Diff = (SubtreeLeafCount > TargetLeafCount ? 
SubtreeLeafCount - TargetLeafCount : TargetLeafCount - SubtreeLeafCount); if (BestNode == UINT_MAX || Diff < BestDiff) { BestNode = Node; BestDiff = Diff; BestLeafCount = SubtreeLeafCount; } } DivideTree(InputTree, BestNode, Tree1, Tree2); } void PermuteTree(const Tree &InputTree, Tree &TreeABC, Tree &TreeACB, Tree &TreeBCA, vector &LabelsA, vector &LabelsB, vector &LabelsC) { LabelsA.clear(); LabelsB.clear(); LabelsC.clear(); const uint InputLeafCount = InputTree.GetLeafCount(); if (InputLeafCount < 10) { TreeABC.Copy(InputTree); TreeACB.Copy(InputTree); TreeBCA.Copy(InputTree); return; } const float NewEdgeLength = 0.1f; Tree TreeA; Tree TreeBC; Tree TreeB; Tree TreeC; DivideTreeFraction(InputTree, 0.33, TreeA, TreeBC); DivideTreeFraction(TreeBC, 0.5, TreeB, TreeC); TreeA.GetLeafLabels(LabelsA); TreeB.GetLeafLabels(LabelsB); TreeC.GetLeafLabels(LabelsC); Tree JoinAB; JoinTrees(TreeA, TreeB, JoinAB, NewEdgeLength); JoinTrees(JoinAB, TreeC, TreeABC, NewEdgeLength); Tree JoinAC; JoinTrees(TreeA, TreeC, JoinAC, NewEdgeLength); JoinTrees(JoinAC, TreeB, TreeACB, NewEdgeLength); Tree JoinBC; JoinTrees(TreeB, TreeC, JoinBC, NewEdgeLength); JoinTrees(JoinBC, TreeA, TreeBCA, NewEdgeLength); TreeABC.Ladderize(true); TreeACB.Ladderize(true); TreeBCA.Ladderize(true); } void PermTree(Tree &InputTree, TREEPERM TP) { if (TP == TP_None) return; uint LeafCount = InputTree.GetLeafCount(); if (LeafCount < 10) return; Tree TreeABC; Tree TreeACB; Tree TreeBCA; vector LabelsA; vector LabelsB; vector LabelsC; PermuteTree(InputTree, TreeABC, TreeACB, TreeBCA, LabelsA, LabelsB, LabelsC); switch (TP) { case TP_ABC: InputTree.Copy(TreeABC); break; case TP_ACB: InputTree.Copy(TreeACB); break; case TP_BCA: InputTree.Copy(TreeBCA); break; default: asserta(false); } } void cmd_permute_tree() { const string &InputFileName = opt(permute_tree); const string &OutputFileName = opt(output); Tree InputTree; InputTree.FromFile(InputFileName); Tree TreeABC; Tree TreeACB; Tree TreeBCA; vector LabelsA; 
vector LabelsB; vector LabelsC; PermuteTree(InputTree, TreeABC, TreeACB, TreeBCA, LabelsA, LabelsB, LabelsC); if (optset_prefix) { const char *Prefix = opt(prefix).c_str(); string FileNameABC; string FileNameACB; string FileNameBCA; Ps(FileNameABC, "%sABC.newick", Prefix); Ps(FileNameACB, "%sACB.newick", Prefix); Ps(FileNameBCA, "%sBCA.newick", Prefix); TreeABC.ToFile(FileNameABC); TreeACB.ToFile(FileNameACB); TreeBCA.ToFile(FileNameBCA); StringsToFile(string(Prefix) + "labelsA.txt", LabelsA); StringsToFile(string(Prefix) + "labelsB.txt", LabelsB); StringsToFile(string(Prefix) + "labelsC.txt", LabelsC); } } muscle-5.1.0/src/perturbhmm.cpp000066400000000000000000000036771424453062600165100ustar00rootroot00000000000000#include "muscle.h" static void Perturb(float &P, float Var) { asserta(Var >= 0 && Var < 1); uint Pct = randu32()%100; float Fract = Pct/100.0f; asserta(Fract >= 0 && Fract <= 1); float Lo = 1.0f - Var; float Hi = 1.0f + Var; float d = Lo + (Hi - Lo)*Fract; P *= d; } void HMMParams::PerturbProbs(uint Seed) { if (Seed == 0) return; ResetRand(Seed); asserta(m_Var > 0 && m_Var < 1); for (uint i = 0; i < SIZE(m_Trans); ++i) Perturb(m_Trans[i], m_Var); const uint AlphaSize = GetAlphaSize(); for (uint i = 0; i < AlphaSize; ++i) for (uint j = 0; j <= i; ++j) { float P = m_Emits[i][j]; Perturb(P, m_Var); m_Emits[i][j] = P; m_Emits[j][i] = P; } Normalize(); } void HMMParams::Compare(const HMMParams &HP1, const HMMParams &HP2, float &MeanTransDelta, float &MeanEmitDelta) { const uint NT = SIZE(HP1.m_Trans); const uint AlphaSize = HP1.GetAlphaSize(); asserta(SIZE(HP2.m_Trans) == NT); asserta(SIZE(HP1.m_Emits) == AlphaSize); asserta(SIZE(HP2.m_Emits) == AlphaSize); float SumT = 0; for (uint i = 0; i < NT; ++i) { float P1 = HP1.m_Trans[i]; float P2 = HP2.m_Trans[i]; SumT += abs(P1 - P2); } MeanTransDelta = SumT/NT; float SumE = 0; for (uint i = 0; i < AlphaSize; ++i) { for (uint j = 0; j < AlphaSize; ++j) { float P1 = HP1.m_Emits[i][j]; float P2 = HP2.m_Emits[i][j]; 
SumE += abs(P1 - P2); } } MeanEmitDelta = SumE/(AlphaSize*AlphaSize); } static void Run1(uint Iter) { bool Nucleo = opt(nt); HMMParams HPDef; HPDef.FromDefaults(Nucleo); HMMParams HP; HP.FromDefaults(Nucleo); HP.PerturbProbs(Iter); float MeanTransDelta; float MeanEmitDelta; HMMParams::Compare(HPDef, HP, MeanTransDelta, MeanEmitDelta); ProgressLog("Iter %u, trans %8.6f, emit %8.6f\n", Iter, MeanTransDelta, MeanEmitDelta); } void cmd_perturbhmm() { const string &sIters = opt(perturbhmm); const uint ITERS = StrToUint(sIters); for (uint Iter = 0; Iter < ITERS; ++Iter) Run1(Iter); } muscle-5.1.0/src/pprog.cpp000066400000000000000000000264641424453062600154510ustar00rootroot00000000000000#include "muscle.h" #include "pprog.h" void ReadStringsFromFile(const string &FileName, vector &Strings) { Strings.clear(); FILE *f = OpenStdioFile(FileName); string Line; while (ReadLineStdioFile(f, Line)) Strings.push_back(Line); } void InvertPath(const string &Path, string &InvertedPath) { InvertedPath.clear(); const uint n = SIZE(Path); for (uint i = 0; i < n; ++i) { char c = Path[i]; if (c == 'B') InvertedPath.push_back('B'); else if (c == 'X') InvertedPath.push_back('Y'); else if (c == 'Y') InvertedPath.push_back('X'); else Die("Invalid path char '%c'", c); } } void ValidatePath(const string &Path, uint LX, uint LY) { uint nX = 0; uint nY = 0; for (uint i = 0; i < SIZE(Path); ++i) { char c = toupper(Path[i]); switch (c) { case 'X': ++nX; break; case 'Y': ++nY; break; case 'B': ++nX, ++nY; break; default: asserta(false); } } asserta(nX == LX); asserta(nY == LY); } void AlignMSAsByPath(const MultiSequence &MSA1, const MultiSequence &MSA2, const string &Path, MultiSequence &MSA12) { uint LX = MSA1.GetColCount(); uint LY = MSA2.GetColCount(); ValidatePath(Path, LX, LY); const uint SeqCount1 = MSA1.GetSeqCount(); const uint SeqCount2 = MSA2.GetSeqCount(); for (int SeqIndex = 0; SeqIndex < (int) SeqCount1; ++SeqIndex) { const Sequence *Seq1 = MSA1.GetSequence(SeqIndex); Sequence 
*AlignedSeq1 = Seq1->AddGapsPath(Path, 'X'); MSA12.AddSequence(AlignedSeq1, true); } for (int SeqIndex = 0; SeqIndex < MSA2.GetNumSequences(); ++SeqIndex) { const Sequence *Seq2 = MSA2.GetSequence(SeqIndex); Sequence *AlignedSeq2 = Seq2->AddGapsPath(Path, 'Y'); MSA12.AddSequence(AlignedSeq2, true); } } void PProg::DeleteIndexesFromPending(uint Index1, uint Index2) { bool Found1 = false; bool Found2 = false; vector NewPending; for (uint i = 0; i < SIZE(m_Pending); ++i) { uint Index = m_Pending[i]; if (Index == Index1) { asserta(!Found1); Found1 = true; continue; } if (Index == Index2) { asserta(!Found2); Found2 = true; continue; } NewPending.push_back(Index); } asserta(Found1); asserta(Found2); asserta(SIZE(NewPending) + 2 == SIZE(m_Pending)); m_Pending = NewPending; } void PProg::FindBestPair(uint &BestIndex1, uint &BestIndex2) const { const uint N = SIZE(m_Pending); asserta(N >= 2); BestIndex1 = m_Pending[0]; BestIndex2 = m_Pending[1]; asserta(BestIndex1 < m_NodeCount); asserta(BestIndex2 < m_NodeCount); float BestScore = m_ScoreMx[BestIndex1][BestIndex2]; for (uint i = 0; i < N; ++i) { uint Indexi = m_Pending[i]; for (uint j = i+1; j < N; ++j) { uint Indexj = m_Pending[j]; asserta(Indexi < SIZE(m_ScoreMx)); asserta(Indexj < SIZE(m_ScoreMx[Indexi])); float Score = m_ScoreMx[Indexi][Indexj]; if (Score > BestScore) { BestScore = Score; BestIndex1 = Indexi; BestIndex2 = Indexj; } } } } const string &PProg::GetMSALabel(uint Index) const { asserta(Index < SIZE(m_MSALabels)); const string &Label = m_MSALabels[Index]; asserta(!Label.empty()); return Label; } void PProg::SetMSA(uint Index, const MultiSequence &MSA) { asserta(Index < SIZE(m_MSAs)); // asserta(m_MSAs[Index] == 0); // TODO: memory leak m_MSAs[Index] = &MSA; } void PProg::SetMSALabel(uint Index, const string &Label) { asserta(Index < SIZE(m_MSALabels)); asserta(m_MSALabels[Index].empty()); m_MSALabels[Index] = Label; asserta(m_MSALabelToIndex.find(Label) == m_MSALabelToIndex.end()); m_MSALabelToIndex[Label] = 
Index; } const MultiSequence &PProg::GetMSA(uint Index) const { asserta(Index < SIZE(m_MSAs)); const MultiSequence *MSA = m_MSAs[Index]; asserta(MSA != 0); return *MSA; } const MultiSequence &PProg::GetFinalMSA() const { asserta(m_InputMSACount > 0); uint FinalIndex = 2*(m_InputMSACount - 1); asserta(FinalIndex + 1 == SIZE(m_MSAs)); const MultiSequence *MSA = m_MSAs[FinalIndex]; asserta(MSA != 0); return *MSA; } void PProg::SetMSAs(const vector &MSAs, const vector &MSALabels) { m_MSALabelToIndex.clear(); m_InputMSACount = SIZE(MSAs); asserta(SIZE(MSALabels) == m_InputMSACount); m_MSAs = MSAs; m_MSALabels = MSALabels; uint TotalMSACount = 2*m_InputMSACount - 1; m_MSAs.resize(TotalMSACount, 0); m_MSALabels.resize(TotalMSACount, ""); for (uint MSAIndex = 0; MSAIndex < m_InputMSACount; ++MSAIndex) { const string &MSALabel = MSALabels[MSAIndex]; asserta(m_MSALabelToIndex.find(MSALabel) == m_MSALabelToIndex.end()); m_MSALabelToIndex[MSALabel] = MSAIndex; } } void PProg::LoadMSAs(const vector &FileNames, bool &IsNucleo) { m_MSALabelToIndex.clear(); m_MSAs.clear(); m_MSALabels.clear(); m_InputMSACount = SIZE(FileNames); asserta(m_InputMSACount > 1); uint TotalMSACount = 2*m_InputMSACount - 1; m_MSAs.resize(TotalMSACount, 0); m_MSALabels.resize(TotalMSACount, ""); m_JoinCount = m_InputMSACount - 1; m_NodeCount = m_InputMSACount + m_JoinCount; for (uint MSAIndex = 0; MSAIndex < m_InputMSACount; ++MSAIndex) { const string &FileName = FileNames[MSAIndex]; ProgressStep(MSAIndex, m_InputMSACount, "Reading %s", FileName.c_str()); MultiSequence &MSA = *new MultiSequence; MSA.LoadMFA(FileName); bool IsNuc = MSA.GuessIsNucleo(); if (MSAIndex == 0) IsNucleo = IsNuc; else asserta(IsNucleo == IsNuc); string MSALabel; GetBaseName(FileName, MSALabel); SetMSALabel(MSAIndex, MSALabel); SetMSA(MSAIndex, MSA); } } void PProg::Run() { m_JoinMSAIndexes1.clear(); m_JoinMSAIndexes2.clear(); m_ScoreMx.clear(); m_PathMx.clear(); for (uint i = 0; i < m_InputMSACount; ++i) m_Pending.push_back(i); 
m_JoinCount = m_InputMSACount - 1; m_NodeCount = m_InputMSACount + m_JoinCount; AlignAllInputPairs(); for (m_JoinIndex = 0; m_JoinIndex < m_JoinCount; ++m_JoinIndex) { ProgressLog("____________________________________________\n"); ProgressLog("Join %u/%u, pending %u\n", m_JoinIndex+1, m_JoinCount, SIZE(m_Pending)); uint Index1; uint Index2; FindBestPair(Index1, Index2); asserta(Index1 != Index2); Join_ByPrecomputedPath(Index1, Index2); AlignNewToPending(); } } void PProg::AlignAllInputPairs() { const uint PairCount = (m_InputMSACount*(m_InputMSACount - 1))/2; m_ScoreMx.resize(m_NodeCount); m_PathMx.resize(m_NodeCount); for (uint i = 0; i < m_NodeCount; ++i) { m_ScoreMx[i].resize(m_NodeCount); m_PathMx[i].resize(m_NodeCount); } uint PairIndex = 0; for (uint MSAIndex1 = 0; MSAIndex1 < m_InputMSACount; ++MSAIndex1) { const string &MSALabel1 = GetMSALabel(MSAIndex1); const MultiSequence &MSA1 = GetMSA(MSAIndex1); const uint SeqCount1 = MSA1.GetSeqCount(); for (uint MSAIndex2 = MSAIndex1+1; MSAIndex2 < m_InputMSACount; ++MSAIndex2) { ++PairIndex; Progress("Input pair %u / %u (%.1f%%)\n", PairIndex, PairCount, GetPct(PairIndex, PairCount)); const string &MSALabel2 = GetMSALabel(MSAIndex2); const MultiSequence &MSA2 = GetMSA(MSAIndex2); const uint SeqCount2 = MSA2.GetSeqCount(); string Path; float Score = AlignMSAsFlat(MSALabel1 + "+" + MSALabel2, MSA1, MSA2, m_TargetPairCount, Path); const uint ColCount1 = MSA1.GetColCount(); const uint ColCount2 = MSA2.GetColCount(); ValidatePath(Path, ColCount1, ColCount2); string InvertedPath; InvertPath(Path, InvertedPath); ValidatePath(InvertedPath, ColCount2, ColCount1); m_ScoreMx[MSAIndex1][MSAIndex2] = Score; m_ScoreMx[MSAIndex2][MSAIndex1] = Score; m_PathMx[MSAIndex1][MSAIndex2] = Path; m_PathMx[MSAIndex2][MSAIndex1] = InvertedPath; } } } void PProg::LogPending(const string &s) const { Log("\nLogPending(%s) m_JoinIndex=%u\n", s.c_str(), m_JoinIndex); for (uint i = 0; i < SIZE(m_Pending); ++i) { uint Index = m_Pending[i]; const 
MultiSequence &MSA = GetMSA(Index); uint SeqCount = MSA.GetSeqCount(); uint ColCount = MSA.GetColCount(); Log(" [%4u] seqs=%u,cols=%u %s\n", Index, SeqCount, ColCount, GetMSALabel(Index).c_str()); } } void PProg::Join_ByPrecomputedPath(uint Index1, uint Index2) { LogPending("Join start"); asserta(SIZE(m_JoinMSAIndexes1) == m_JoinIndex); asserta(SIZE(m_JoinMSAIndexes2) == m_JoinIndex); m_JoinMSAIndexes1.push_back(Index1); m_JoinMSAIndexes2.push_back(Index2); uint NewMSAIndex = m_InputMSACount + m_JoinIndex; string NewMSALabel; Ps(NewMSALabel, "Join%u", m_JoinIndex+1); const string &MSALabel1 = m_MSALabels[Index1]; const string &MSALabel2 = m_MSALabels[Index2]; const MultiSequence &MSA1 = GetMSA(Index1); const MultiSequence &MSA2 = GetMSA(Index2); MultiSequence *MSA12 = new MultiSequence; const string &Path = m_PathMx[Index1][Index2]; AlignMSAsByPath(MSA1, MSA2, Path, *MSA12); AssertSeqsEq(MSA1, *MSA12); AssertSeqsEq(MSA2, *MSA12); ProgressLog("Join %u/%u best pair %u, %u\n", m_JoinIndex+1, m_JoinCount, Index1, Index2); Log(" Join_%u.X=%s\n", m_JoinIndex+1, MSALabel1.c_str()); Log(" Join_%u.Y=%s\n", m_JoinIndex+1, MSALabel2.c_str()); SetMSA(NewMSAIndex, *MSA12); SetMSALabel(NewMSAIndex, NewMSALabel); string JoinFileName = "."; if (optset_savedir) { string Prefix = opt(savedir); Dirize(Prefix); Ps(JoinFileName, "%sjoin%u", Prefix.c_str(), m_JoinIndex); ProgressLog("Writing join MSA: %s\n", JoinFileName.c_str()); MSA12->WriteMFA(JoinFileName); } const uint JoinedSeqCount = MSA12->GetSeqCount(); const uint JoinedColCount = MSA12->GetColCount(); m_Pending.push_back(NewMSAIndex); uint PendingCountBeforeJoin = SIZE(m_Pending); DeleteIndexesFromPending(Index1, Index2); uint PendingCountAfterJoin = SIZE(m_Pending); asserta(PendingCountAfterJoin + 2 == PendingCountBeforeJoin); LogPending("Join end"); } void PProg::AlignNewToPending() { const uint PendingCount = SIZE(m_Pending); LogPending("AlignNewToPending"); asserta(PendingCount > 0); const uint NewIndex = 
m_Pending[PendingCount-1]; const string &NewMSALabel = m_MSALabels[NewIndex]; const MultiSequence *NewMSA = &GetMSA(NewIndex); for (uint i = 0; i + 1 < PendingCount; ++i) { ProgressLog("Join %u/%u new vs. pending %u/%u\n\n", m_JoinIndex+1, m_JoinCount, i+1, PendingCount); uint Index = m_Pending[i]; const MultiSequence *MSA = &GetMSA(Index); const string &MSALabeli = m_MSALabels[i]; string Path; float Score = AlignMSAsFlat(NewMSALabel + "+" + MSALabeli, *NewMSA, *MSA, m_TargetPairCount, Path); string InvertedPath; InvertPath(Path, InvertedPath); m_ScoreMx[NewIndex][Index] = Score; m_ScoreMx[Index][NewIndex] = Score; m_PathMx[NewIndex][Index] = Path; m_PathMx[Index][NewIndex] = InvertedPath; } } void PProg::WriteGuideTree(const string &FileName) const { if (FileName.empty()) return; Tree GuideTree; MakeGuideTreeFromJoinOrder(m_JoinMSAIndexes1, m_JoinMSAIndexes2, m_MSALabelToIndex, GuideTree); GuideTree.ToFile(FileName); } void cmd_pprog() { PProg PP; vector MSAFileNames; ReadStringsFromFile(opt(pprog), MSAFileNames); const uint MSACount = SIZE(MSAFileNames); asserta(MSACount > 1); const string &OutputFileName = opt(output); PP.m_TargetPairCount = DEFAULT_TARGET_PAIR_COUNT; if (optset_paircount) PP.m_TargetPairCount = int(opt(paircount)); bool IsNucleo; PP.LoadMSAs(MSAFileNames, IsNucleo); SetAlpha(IsNucleo ? 
ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); PP.Run(); asserta(SIZE(PP.m_Pending) == 1); uint Index = PP.m_Pending[0]; const MultiSequence &FinalMSA = PP.GetFinalMSA(); FinalMSA.WriteMFA(OutputFileName); PP.WriteGuideTree(opt(guidetreeout)); } muscle-5.1.0/src/pprog.h000066400000000000000000000036051424453062600151060ustar00rootroot00000000000000#pragma once #include "tree.h" static const uint DEFAULT_TARGET_PAIR_COUNT = 2000; static const uint DEFAULT_MAX_COARSE_SEQS = 500; class PProg { public: uint m_InputMSACount = 0; uint m_JoinCount = 0; uint m_NodeCount = 0; uint m_TargetPairCount = DEFAULT_TARGET_PAIR_COUNT; uint m_MaxCoarseSeqs = DEFAULT_MAX_COARSE_SEQS; map m_MSALabelToIndex; vector m_MSALabels; vector m_MSAs; vector m_Pending; vector > m_ScoreMx; vector > m_PathMx; uint m_JoinIndex = 0; vector m_JoinMSAIndexes1; vector m_JoinMSAIndexes2; public: void LoadMSAs(const vector &FileNames, bool &IsNucleo); void SetMSAs(const vector &MSAs, const vector &MSALabels); void Run(); void RunGuideTree(const Tree &GuideTree); void Run2(const vector &Indexes1, const vector &Indexes2); void DeleteIndexesFromPending(uint Index1, uint Index2); void FindBestPair(uint &BestIndex1, uint &BestIndex2) const; void AlignAllInputPairs(); void AlignAndJoin(uint Index1, uint Index2); void Join_ByPrecomputedPath(uint Index1, uint Index2); void AlignNewToPending(); void LogPending(const string &s) const; const MultiSequence &GetMSA(uint Index) const; const string &GetMSALabel(uint Index) const; void SetMSA(uint Index, const MultiSequence &MSA); void SetMSALabel(uint Index, const string &Label); const MultiSequence &GetFinalMSA() const; void WriteGuideTree(const string &FileName) const; }; void MakeGuideTreeFromJoinOrder(const vector &Indexes1, const vector &Indexes2, const map &LabelToIndex, Tree &GuideTree); void GetGuideTreeJoinOrder(const Tree &GuideTree, const map &LabelToIndex, vector &Indexes1, vector &Indexes2); void ValidateJoinOrder(const vector &Indexes1, const vector 
&Indexes2); muscle-5.1.0/src/pprog2.cpp000066400000000000000000000055721424453062600155300ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "pprog.h" void ReadStringsFromFile(const string &FileName, vector &Strings); void PProg::AlignAndJoin(uint Index1, uint Index2) { m_JoinMSAIndexes1.push_back(Index1); m_JoinMSAIndexes2.push_back(Index2); const MultiSequence &MSA1 = GetMSA(Index1); const MultiSequence &MSA2 = GetMSA(Index2); AssertSameLabels(MSA1); AssertSameLabels(MSA2); const string &MSALabel1 = m_MSALabels[Index1]; const string &MSALabel2 = m_MSALabels[Index2]; string ProgressStr; Ps(ProgressStr, "Join %u / %u", m_JoinIndex+1, m_JoinCount); string Path; AlignMSAsFlat(ProgressStr, MSA1, MSA2, m_TargetPairCount, Path); string MSALabel12; Ps(MSALabel12, "Join_%u", m_JoinIndex+1); MultiSequence *MSA12 = new MultiSequence; AlignMSAsByPath(MSA1, MSA2, Path, *MSA12); AssertSeqsEq(MSA1, *MSA12); AssertSeqsEq(MSA2, *MSA12); AssertSameSeqsJoin(MSA1, MSA2, *MSA12); AssertSameLabels(*MSA12); if (Index1 >= m_InputMSACount) { delete &MSA1; m_MSAs[Index1] = 0; } if (Index2 >= m_InputMSACount) { delete &MSA2; m_MSAs[Index2] = 0; } uint NewMSAIndex = m_InputMSACount + m_JoinIndex; SetMSA(NewMSAIndex, *MSA12); if (optset_savedir) { string Prefix = opt(savedir); Dirize(Prefix); string JoinFileName; Ps(JoinFileName, "%sjoin%u", Prefix.c_str(), m_JoinIndex); ProgressLog("Writing join MSA: %s\n", JoinFileName.c_str()); MSA12->WriteMFA(JoinFileName); } } void PProg::Run2(const vector &Indexes1, const vector &Indexes2) { asserta(m_InputMSACount > 0); m_JoinCount = m_InputMSACount - 1; m_NodeCount = m_InputMSACount + m_JoinCount; asserta(SIZE(Indexes1) == m_JoinCount); asserta(SIZE(Indexes2) == m_JoinCount); ValidateJoinOrder(Indexes1, Indexes2); for (m_JoinIndex = 0; m_JoinIndex < m_JoinCount; ++m_JoinIndex) { uint Index1 = Indexes1[m_JoinIndex]; uint Index2 = Indexes2[m_JoinIndex]; AlignAndJoin(Index1, Index2); } } void cmd_pprog2() { PProg PP; vector 
MSAFileNames; ReadStringsFromFile(opt(pprog2), MSAFileNames); const uint MSACount = SIZE(MSAFileNames); asserta(MSACount > 1); const string &OutputFileName = opt(output); PP.m_TargetPairCount = DEFAULT_TARGET_PAIR_COUNT; if (optset_paircount) PP.m_TargetPairCount = int(opt(paircount)); bool IsNucleo; PP.LoadMSAs(MSAFileNames, IsNucleo); SetAlpha(IsNucleo ? ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); vector Indexes1; vector Indexes2; FILE *f = OpenStdioFile(opt(joins)); string Line; vector Fields; while (ReadLineStdioFile(f, Line)) { Split(Line, Fields, '\t'); asserta(SIZE(Fields) == 2); uint Index1 = StrToUint(Fields[0]); uint Index2 = StrToUint(Fields[1]); Indexes1.push_back(Index1); Indexes2.push_back(Index2); } CloseStdioFile(f); f = 0; PP.Run2(Indexes1, Indexes2); const MultiSequence &FinalMSA = PP.GetFinalMSA(); FinalMSA.WriteMFA(OutputFileName); } muscle-5.1.0/src/pprogt.cpp000066400000000000000000000022311424453062600156170ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "textfile.h" #include "tree.h" #include "pprog.h" #include void ReadStringsFromFile(const string &FileName, vector &Strings); void PProg::RunGuideTree(const Tree &GuideTree) { asserta(m_InputMSACount > 0); m_JoinCount = m_InputMSACount - 1; m_NodeCount = m_InputMSACount + m_JoinCount; vector Indexes1; vector Indexes2; GetGuideTreeJoinOrder(GuideTree, m_MSALabelToIndex, Indexes1, Indexes2); Run2(Indexes1, Indexes2); } void cmd_pprogt() { vector MSAFileNames; ReadStringsFromFile(opt(pprogt), MSAFileNames); const uint MSACount = SIZE(MSAFileNames); asserta(MSACount > 1); const string &OutputFileName = opt(output); PProg PP; PP.m_TargetPairCount = DEFAULT_TARGET_PAIR_COUNT; if (optset_paircount) PP.m_TargetPairCount = int(opt(paircount)); bool IsNucleo; PP.LoadMSAs(MSAFileNames, IsNucleo); SetAlpha(IsNucleo ? 
ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); Tree T; T.FromFile(opt(guidetreein)); PP.RunGuideTree(T); const MultiSequence &FinalMSA = PP.GetFinalMSA(); FinalMSA.WriteMFA(OutputFileName); PP.WriteGuideTree(opt(guidetreeout)); } muscle-5.1.0/src/probcons.cpp000066400000000000000000000026071424453062600161400ustar00rootroot00000000000000#include "muscle.h" void ProgressLogInputSummary(const string &FileName, const MultiSequence &Seqs) { const uint SeqCount = (uint) Seqs.GetNumSequences(); uint MinL = 0; uint MaxL = 0; for (uint i = 0; i < SeqCount; ++i) { const Sequence &Seq = *Seqs.GetSequence(i); uint L = (uint) Seq.GetLength(); if (i == 0) { MinL = L; MaxL = L; } else { MinL = min(MinL, L); MaxL = max(MaxL, L); } } ProgressLog("\n"); ProgressLog("Input %s\n", FileName.c_str()); ProgressLog("Seqs %u\n", SeqCount); ProgressLog("MinL %u\n", MinL); ProgressLog("MaxL %u\n", MaxL); ProgressLog("\n"); } void ProgressLogMSASummary(const string &Str, const MultiSequence &MSA) { const uint SeqCount = (uint) MSA.GetNumSequences(); const uint ColCount = (uint) MSA.GetColCount(); ProgressLog("\n"); ProgressLog("%s\n", Str.c_str()); ProgressLog("Seqs %u\n", SeqCount); ProgressLog("Cols %u\n", ColCount); ProgressLog("\n"); } //void RunMTProbcons(MultiSequence &InputSeqs) // { // bool IsNucleo = InputSeqs.GuessIsNucleo(); // if (IsNucleo) // SetAlpha(ALPHA_Nucleo); // else // SetAlpha(ALPHA_Amino); // // const string &OutputFileName = opt(output); // // InitProbcons(); // // MultiSequence* alignment = RunMPC(&InputSeqs); // // alignment->WriteMFA(opt(output)); // // string s; // Psa(s, "MSA %s", OutputFileName.c_str()); // ProgressLogMSASummary(s, *alignment); // } muscle-5.1.0/src/progalnflat.cpp000066400000000000000000000042401424453062600166170ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include "mpcflat.h" void MPCFlat::FreeProgMSAs() { const uint n = SIZE(m_ProgMSAs); for (uint i = 0; i < n; ++i) { MultiSequence *MSA = m_ProgMSAs[i]; if (MSA != 0) 
delete MSA; } m_ProgMSAs.clear(); } void MPCFlat::FreeSparsePosts() { for (uint i = 0; i < SIZE(m_SparsePosts1); ++i) { if (m_SparsePosts1[i] != 0) { delete m_SparsePosts1[i]; m_SparsePosts1[i] = 0; } } for (uint i = 0; i < SIZE(m_SparsePosts2); ++i) { if (m_SparsePosts2[i] != 0) { delete m_SparsePosts2[i]; m_SparsePosts2[i] = 0; } } m_SparsePosts1.clear(); m_SparsePosts2.clear(); } void MPCFlat::ProgAln(uint JoinIndex) { uint Index1 = m_JoinIndexes1[JoinIndex]; uint Index2 = m_JoinIndexes2[JoinIndex]; assert(Index1 < SIZE(m_ProgMSAs)); assert(Index2 < SIZE(m_ProgMSAs)); MultiSequence *MSA1 = m_ProgMSAs[Index1]; MultiSequence *MSA2 = m_ProgMSAs[Index2]; assert(MSA1 != 0); assert(MSA2 != 0); MultiSequence *MSA12 = AlignAlns(*MSA1, *MSA2); #if 0//TRACE uint SeqCount12 = MSA12->GetSeqCount(); uint Index12 = SIZE(m_ProgMSAs); uint SeqCount1 = MSA1->GetSeqCount(); uint SeqCount2 = MSA2->GetSeqCount(); Log("Flat Join %u(%u) + %u(%u) = %u(%u)\n", Index1, SeqCount1, Index2, SeqCount2, Index12, SeqCount12); #endif m_ProgMSAs.push_back(MSA12); const uint SeqCount = GetSeqCount(); delete MSA1; delete MSA2; m_ProgMSAs[Index1] = 0; m_ProgMSAs[Index2] = 0; } void MPCFlat::ProgressiveAlign() { const uint SeqCount = m_InputSeqs->GetSeqCount(); const uint JoinCount = SeqCount - 1; const uint NodeCount = SeqCount + JoinCount; for (uint i = 0; i < SeqCount; ++i) { const Sequence *Seq = m_InputSeqs->GetSequence(i); MultiSequence *MS = new MultiSequence; MS->AddSequence(Seq, false); m_ProgMSAs.push_back(MS); } asserta(SIZE(m_JoinIndexes1) == JoinCount); asserta(SIZE(m_JoinIndexes2) == JoinCount); ValidateJoinOrder(m_JoinIndexes1, m_JoinIndexes2); for (uint JoinIndex = 0; JoinIndex < JoinCount; ++JoinIndex) ProgAln(JoinIndex); asserta(SIZE(m_ProgMSAs) == NodeCount); m_MSA = m_ProgMSAs[NodeCount-1]; m_ProgMSAs[NodeCount-1] = 0; FreeProgMSAs(); asserta(m_MSA != 0); } muscle-5.1.0/src/project.cpp000066400000000000000000000047301424453062600157600ustar00rootroot00000000000000#include 
"muscle.h" ///////////////////////////////////////////////////////////////// // MultiSequence::Project() // // Given a set of indices, extract all sequences from the current // MultiSequence object whose index is included in the set. // Then, project the multiple alignments down to the desired // subset, and return the projection as a new MultiSequence // object. ///////////////////////////////////////////////////////////////// MultiSequence* MultiSequence::Project(const set& indices) { vector *> newPtrs(indices.size()); assert(indices.size() != 0); // grab old data //vector::iterator> oldPtrs(indices.size()); //for (set::const_iterator iter = indices.begin(); // iter != indices.end(); ++iter) // oldPtrs[i++] = GetSequence(*iter)->GetDataPtr(); int i = 0; vector oldPtrs(indices.size()); for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter) oldPtrs[i++] = GetSequence(*iter)->GetCharPtr1(); // compute new length int oldLength = GetSequence(*indices.begin())->GetLength(); int newLength = 0; for (i = 1; i <= oldLength; i++) { // check to see if there is a gap in every sequence of the set bool found = false; for (int j = 0; !found && j < (int)indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then this column counts towards the sequence length if (found) newLength++; } // build new alignments for (i = 0; i < (int)indices.size(); i++) { newPtrs[i] = new vector; assert(newPtrs[i]); newPtrs[i]->push_back('@'); } // add all needed columns for (i = 1; i <= oldLength; i++) { // make sure column is not gapped in all sequences in the set bool found = false; for (int j = 0; !found && j < (int)indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then add it if (found) { for (int j = 0; j < (int)indices.size(); j++) newPtrs[j]->push_back(oldPtrs[j][i]); } } // wrap sequences in MultiSequence object MultiSequence* ret = new MultiSequence(); i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter) { 
const Sequence *OldSeq = GetSequence(*iter); Sequence *NewSeq = NewSequence(); asserta(NewSeq != 0); vector *DataPtr = newPtrs[i++]; const string &Label = OldSeq->m_Label; uint GSI = OldSeq->GetGSI(); uint SMI = OldSeq->GetSMI(); NewSeq->Create(DataPtr, Label, GSI, SMI); ret->AddSequence(NewSeq, true); } return ret; } muscle-5.1.0/src/pwpath.h000066400000000000000000000044321424453062600152610ustar00rootroot00000000000000#ifndef PWPath_h #define PWPath_h /*** Each PWEdge in a PWPath specifies a column in a pair-wise (PW) alignment. "Path" is by analogy with the path through an HMM. Edge types are: 'M' LetterA + LetterB 'D' LetterA + GapB 'I' GapA + LetterB The mnemonic is Match, Delete, Insert (with respect to A). Here is a global alignment of sequences A and B. A: AMQT-F B: -M-TIF The path for this example is: Edge cType uPrefixLengthA uPrefixLengthB 0 D 1 0 1 M 2 1 2 D 3 1 3 M 4 2 4 I 4 3 5 M 5 4 Given the starting positions in each alignment (e.g., column zero for a global alignment), the prefix length fields are redundant; they are included only for convenience and as a sanity check, we are not trying to optimize for speed or space here. We use prefix lengths rather than column indexes because of the problem of representing the special case of a gap in the first position. 
***/ class Seq; class MSA; class SatchmoParams; class PW; class TextFile; class PWScore; class PWEdge { public: char cType; unsigned uPrefixLengthA; unsigned uPrefixLengthB; bool Equal(const PWEdge &e) const { return uPrefixLengthA == e.uPrefixLengthA && uPrefixLengthB == e.uPrefixLengthB && cType == e.cType; } }; class PWPath { // Disable compiler defaults private: PWPath &operator=(const PWPath &rhs); PWPath(const PWPath &rhs); public: PWPath(); virtual ~PWPath(); public: void Clear(); void FromStr(const char Str[]); void Copy(const PWPath &Path); void AppendEdge(const PWEdge &Edge); void AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB); void PrependEdge(const PWEdge &Edge); unsigned GetEdgeCount() const { return m_uEdgeCount; } const PWEdge &GetEdge(unsigned uEdgeIndex) const; void Validate(const PWScore &PWS) const; void Validate() const; void LogMe() const; void FromFile(TextFile &File); void ToFile(TextFile &File) const; void FromMSAPair(const MSA &msaA, const MSA &msaB); void AssertEqual(const PWPath &Path) const; bool Equal(const PWPath &Path) const; unsigned GetMatchCount() const; unsigned GetDeleteCount() const; unsigned GetInsertCount() const; private: void ExpandPath(unsigned uAdditionalEdgeCount); private: unsigned m_uEdgeCount; unsigned m_uArraySize; PWEdge *m_Edges; }; #endif // PWPath_h muscle-5.1.0/src/qscore.cpp000066400000000000000000000156011424453062600156050ustar00rootroot00000000000000#include "muscle.h" // O(NL) computation of PREFAB Q score and Balibase TC score. // Algorithm based on an idea due to Chuong (Tom) Do. // Each position in the reference alignment is annotated with // the column number C in the test alignment where the same // letter is found. A pair of identical Cs in the same reference // column indicates a correctly aligned pair of letters. 
void cmd_qscore() { const string TestFileName = opt(qscore); const string RefFileName = opt(ref); MSA msaTest; MSA msaRef; msaTest.FromFASTAFile(TestFileName); extern bool g_FASTA_Upper; bool SaveUpper = g_FASTA_Upper; g_FASTA_Upper = false; msaRef.FromFASTAFile(RefFileName); g_FASTA_Upper = SaveUpper; double Q = 0; double TC = 0; uint SeqDiffCount = 0; uint64 CorrectPairCount = 0; uint64 RefAlignedPairCount = 0; if (opt(verbose)) { Log("RefCol RefAln NonGapped TestAll CorrCols Ref\n"); Log("------ ------ --------- ------- -------- ---\n"); // 6 9 7 8 } const uint RefSeqCount = msaRef.GetSeqCount(); const uint TestSeqCount = msaTest.GetSeqCount(); const uint RefColCount = msaRef.GetColCount(); const uint TestColCount = msaTest.GetColCount(); map RefSeqNameToIndex; vector RefToTestSeqIndex(RefSeqCount); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; ++RefSeqIndex) { const string SeqName = msaRef.GetSeqName(RefSeqIndex); RefToTestSeqIndex[RefSeqIndex] = UINT_MAX; RefSeqNameToIndex[SeqName] = RefSeqIndex; } uint FoundCount = 0; for (uint TestSeqIndex = 0; TestSeqIndex < TestSeqCount; ++TestSeqIndex) { const string SeqName = msaTest.GetSeqName(TestSeqIndex); map::const_iterator p = RefSeqNameToIndex.find(SeqName); if (p != RefSeqNameToIndex.end()) { uint RefSeqIndex = p->second; if (RefSeqIndex == UINT_MAX) Die("UINT_MAX"); RefToTestSeqIndex[RefSeqIndex] = TestSeqIndex; ++FoundCount; } } if (FoundCount == 0) Die("No reference labels found in test MSA"); if (FoundCount > RefSeqCount) Warning("%u reference sequences not found in test MSA", RefSeqCount - FoundCount); // TestColIndex[i] is the one-based (not zero-based!) test column index // of the letter found in the current column of the reference alignment // (or the most recent letter if the reference column is gapped, or zero // if no letter has yet been found). Here, seq index i is for msaRef. 
vector TestColIndex(TestSeqCount, 0); // TestColIndexCount[i] is the number of times that a letter from test // column i (one-based!) appears in the current reference column. vector TestColIndexCount(TestColCount+1, 0); // TestColIndexes[i] is the column index in the test alignment of // the i'th non-gapped position in the current reference column. vector TestColIndexes; uint RefAlignedColCount = 0; uint CorrectColCount = 0; if (opt(verbose)) { Log("RefCol RefAln NonGapped TestAll CorrCols Ref\n"); Log("------ ------ --------- ------- -------- ---\n"); // 6 9 7 8 } for (uint RefColIndex = 0; RefColIndex < RefColCount; RefColIndex++) { TestColIndexes.clear(); TestColIndexes.reserve(RefSeqCount); // NonGappedCount is the number of non-gapped positions in the current // reference column. uint NonGappedCount = 0; uint FirstTestColIndex = UINT_MAX; bool RefColIsAligned = false; bool TestColAllCorrect = true; bool TestAllAligned = true; for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; RefSeqIndex++) { uint TestSeqIndex = RefToTestSeqIndex[RefSeqIndex]; if (TestSeqIndex == UINT_MAX) continue; char cRef = msaRef.GetChar(RefSeqIndex, RefColIndex); if (!isgap(cRef)) { char cTest = 0; uint Col = TestColIndex[TestSeqIndex]; do cTest = msaTest.GetChar(TestSeqIndex, Col++); while (isgap(cTest)); if (toupper(cRef) != toupper(cTest)) { ++SeqDiffCount; Warning("Test seq %u (%s) differs from ref seq %u (%s), ref col %u=%c, test=%c", TestSeqIndex, msaTest.GetSeqName(TestSeqIndex), RefSeqIndex, msaRef.GetSeqName(RefSeqIndex), RefColIndex, cRef, cTest); } if (isalpha(cRef) && (isupper(cRef) || cRef == 'x')) { RefColIsAligned = true; ++NonGappedCount; if (isupper(cTest)) { TestColIndexes.push_back(Col); ++(TestColIndexCount[Col]); if (FirstTestColIndex == UINT_MAX) FirstTestColIndex = Col; else { if (FirstTestColIndex != Col) TestColAllCorrect = false; } } else TestAllAligned = false; } else { if (RefColIsAligned) { Log("\n"); Log("Ref col: "); for (uint RefSeqIndex = 0; 
RefSeqIndex < RefSeqCount; RefSeqIndex++) Log("%c", msaRef.GetChar(RefSeqIndex, RefColIndex)); Log("\n"); Die("Ref col %u has both upper- and lower-case letters", RefColIndex); } } TestColIndex[TestSeqIndex] = Col; } } if (RefColIsAligned && NonGappedCount > 1) { ++RefAlignedColCount; if (TestColAllCorrect && TestAllAligned) ++CorrectColCount; } uint ColPairCount = 0; for (vector::const_iterator p = TestColIndexes.begin(); p != TestColIndexes.end(); ++p) { uint Col = *p; uint Count = TestColIndexCount[Col]; if (Count > 0) ColPairCount += Count*(Count - 1)/2; TestColIndexCount[Col] = 0; } CorrectPairCount += ColPairCount; RefAlignedPairCount += NonGappedCount*(NonGappedCount - 1)/2; if (opt(verbose)) { Log("%6u %6c %9u %7c %8u ", RefColIndex, RefColIsAligned ? 'T' : 'F', NonGappedCount, TestColAllCorrect ? 'T' : 'F', CorrectColCount); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; RefSeqIndex++) { uint TestSeqIndex = RefToTestSeqIndex[RefSeqIndex]; if (TestSeqIndex == UINT_MAX) continue; char cRef = msaRef.GetChar(RefSeqIndex, RefColIndex); Log("%c", cRef); } Log("\n"); } } if (RefAlignedPairCount == 0) Q = 0; else Q = (double) CorrectPairCount / (double) RefAlignedPairCount; if (RefAlignedColCount == 0) { Warning("reference alignment %s has no aligned (upper-case) columns\n", RefFileName.c_str()); TC = 0; } else TC = (double) CorrectColCount / (double) RefAlignedColCount; if (opt(verbose)) { Log(" ------ --------\n"); Log("%6.6s %6u %9.9s %7.7s %8u\n", "", RefAlignedColCount, "", "", CorrectColCount); Log("\n"); Log("CorrectPairCount %u\n", CorrectPairCount); Log("RefAlignedPairCount %u\n", RefAlignedPairCount); Log("CorrectColCount %u\n", CorrectColCount); Log("RefAlignedColCount %u\n", RefAlignedColCount); Log("Q %.4f\n", Q); Log("TC %.4f\n", TC); } if (SeqDiffCount > 0) Warning("%u seq diffs ignored", SeqDiffCount); ProgressLog("%s Q=%.3g, TC=%.3g\n", TestFileName.c_str(), Q, TC); } 
muscle-5.1.0/src/qscore2.cpp000066400000000000000000000034631424453062600156720ustar00rootroot00000000000000#include "muscle.h" #include "qscorer.h" void cmd_qscore2() { const string TestFileName = opt(qscore2); const string RefFileName = opt(ref); double MaxGapFract = optd(max_gap_fract, 1.0); string Name; GetBaseName(TestFileName.c_str(), Name); MSA Test; MSA Ref; Test.FromFASTAFile(TestFileName); Ref.FromFASTAFile_PreserveCase(RefFileName); QScorer QS; QS.m_MaxGapFract = MaxGapFract; QS.Run(Test, Ref); ProgressLog("%s: Q=%.4f, TC=%.4f\n", Name.c_str(), QS.m_Q, QS.m_TC); } void cmd_qscoredir() { const string NamesFileName = opt(qscoredir); string TestDir = opt(testdir); string RefDir = opt(refdir); const string OutputFileName = opt(output); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); Dirize(TestDir); Dirize(RefDir); vector Names; ReadStringsFromFile(NamesFileName, Names); FILE *fOut = CreateStdioFile(OutputFileName); float SumQ = 0; float SumTC = 0; float AvgQ = 0; float AvgTC = 0; const uint NameCount = SIZE(Names); for (uint i = 0; i < NameCount; ++i) { ProgressStep(i, NameCount, "%s Q %.2f TC %.2f", TestDir.c_str(), AvgQ, AvgTC); const string &Name = Names[i]; const string &TestFileName = TestDir + Name; const string &RefFileName = RefDir + Name; MSA Test; MSA Ref; Test.FromFASTAFile(TestFileName); extern bool g_FASTA_Upper; bool SaveUpper = g_FASTA_Upper; g_FASTA_Upper = false; Ref.FromFASTAFile(RefFileName); g_FASTA_Upper = SaveUpper; QScorer QS; QS.m_MaxGapFract = MaxGapFract; QS.Run(Test, Ref); Pf(fOut, "set=%s q=%.4f tc=%.4f\n", Name.c_str(), QS.m_Q, QS.m_TC); SumQ += QS.m_Q; SumTC += QS.m_TC; AvgQ = SumQ/(i+1); AvgTC = SumTC/(i+1); } Pf(fOut, "testdir=%s n=%u avgq=%.4f avgtc=%.4f\n", TestDir.c_str(), NameCount, AvgQ, AvgTC); CloseStdioFile(fOut); } muscle-5.1.0/src/qscoreefa.cpp000066400000000000000000000014341424453062600162600ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" #include 
"qscorer.h" void cmd_qscore_efa() { const string EfaFileName = opt(qscore_efa); const string RefFileName = opt(ref); const string OutputFileName = opt(output); double MaxGapFract = optd(max_gap_fract, 1.0); Ensemble E; E.FromFile(EfaFileName); MSA RefMSA; RefMSA.FromFASTAFile_PreserveCase(RefFileName); string RefName; GetBaseName(RefFileName, RefName); QScorer QS; QS.m_MaxGapFract = MaxGapFract; const uint MSACount = E.GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const MSA &TestMSA = E.GetMSA(MSAIndex); const string &TestName = E.GetMSAName(MSAIndex); QS.Run(TestMSA, RefMSA); ProgressLog("%s %s Q=%.4f TC=%.4f\n", RefName.c_str(), TestName.c_str(), QS.m_Q, QS.m_TC); } } muscle-5.1.0/src/qscorer.cpp000066400000000000000000000224021424453062600157640ustar00rootroot00000000000000#include "muscle.h" #include "qscorer.h" void QScorer::Clear() { m_Test = 0; m_Ref = 0; m_RefAlignedColCount = 0; m_Labels.clear(); m_RefSeqIndexes.clear(); m_TestSeqIndexes.clear(); m_RefSeqIndexToTestSeqIndex.clear(); m_RefCols.clear(); m_RefUngappedCounts.clear(); m_PosToTestColVec.clear(); m_PosToRefColVec.clear(); m_TestColToPosVec.clear(); m_RefColToPosVec.clear(); m_RefColToTestColVec.clear(); m_TestColToBestRefCol.clear(); m_MaxFracts.clear(); m_BestTestCols.clear(); m_TotalPairs = 0; m_TotalCols = 0; m_CorrectPairs = 0; m_CorrectCols = 0; m_Q = 0; m_TC = 0; m_RefLabels.clear(); m_RefLabelToSeqIndex.clear(); m_TestColToCount.clear(); } void QScorer::InitRefLabels() { m_RefLabels.clear(); m_RefLabelToSeqIndex.clear(); const uint RefSeqCount = GetRefSeqCount(); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; ++RefSeqIndex) { const string Label = (const string) m_Ref->GetSeqName(RefSeqIndex); if (m_RefLabelToSeqIndex.find(Label) != m_RefLabelToSeqIndex.end()) Die("Dupe ref label >%s", Label.c_str()); m_RefLabels.push_back(Label); m_RefLabelToSeqIndex[Label] = RefSeqIndex; } } void QScorer::InitRefToTest() { const uint RefSeqCount = GetRefSeqCount(); const 
uint TestSeqCount = GetTestSeqCount(); m_RefSeqIndexToTestSeqIndex.clear(); m_RefSeqIndexToTestSeqIndex.resize(RefSeqCount, UINT_MAX); for (uint TestSeqIndex = 0; TestSeqIndex < TestSeqCount; ++TestSeqIndex) { const string Label = (const string) m_Test->GetSeqName(TestSeqIndex); map::const_iterator p = m_RefLabelToSeqIndex.find(Label); if (p == m_RefLabelToSeqIndex.end()) continue; uint RefSeqIndex = p->second; asserta(RefSeqIndex < RefSeqCount); if (m_RefSeqIndexToTestSeqIndex[RefSeqIndex] != UINT_MAX) Warning("Ref label found twice in test MSA >%s", Label.c_str()); m_RefSeqIndexToTestSeqIndex[RefSeqIndex] = TestSeqIndex; m_Labels.push_back(Label); m_RefSeqIndexes.push_back(RefSeqIndex); m_TestSeqIndexes.push_back(TestSeqIndex); } } void QScorer::InitColPosVecs1(uint i) { const uint TestColCount = GetTestColCount(); const uint RefColCount = GetRefColCount(); uint RefSeqIndex = m_RefSeqIndexes[i]; uint TestSeqIndex = m_TestSeqIndexes[i]; const string &Label = m_Labels[i]; const string &TestLabel = m_Test->GetSeqName(TestSeqIndex); const string &RefLabel = m_Ref->GetSeqName(RefSeqIndex); asserta(TestLabel == RefLabel); m_Ref->GetPosToCol(RefSeqIndex, m_PosToRefColVec[i]); m_Test->GetPosToCol(TestSeqIndex, m_PosToTestColVec[i]); const uint RefUngappedLength = SIZE(m_PosToRefColVec[i]); const uint TestUngappedLength = SIZE(m_PosToTestColVec[i]); asserta(RefUngappedLength == TestUngappedLength); m_Ref->GetColToPos(RefSeqIndex, m_RefColToPosVec[i]); m_Test->GetColToPos(TestSeqIndex, m_TestColToPosVec[i]); asserta(SIZE(m_RefColToPosVec[i]) == RefColCount); asserta(SIZE(m_TestColToPosVec[i]) == TestColCount); const uint L = SIZE(m_PosToRefColVec[i]); const uint Lt = SIZE(m_PosToTestColVec[i]); if (L != Lt) Die("Seq lengths differ ref=%u, test=%u >%s", L, Lt, Label.c_str()); m_RefColToTestColVec[i].resize(RefColCount, UINT_MAX); for (uint RefCol = 0; RefCol < RefColCount; ++RefCol) { uint Pos = m_RefColToPosVec[i][RefCol]; if (Pos == UINT_MAX) 
m_RefColToTestColVec[i][RefCol] = UINT_MAX; else { asserta(Pos < SIZE(m_PosToTestColVec[i])); uint TestCol = m_PosToTestColVec[i][Pos]; asserta(TestCol < TestColCount); char TestChar = m_Test->GetChar(TestSeqIndex, TestCol); char RefChar = m_Ref->GetChar(RefSeqIndex, RefCol); asserta(!isgap(TestChar) && !isgap(RefChar)); if (toupper(TestChar) != toupper(RefChar)) Die("Sequences differ pos %u test %c ref %c >%s", Pos, TestChar, RefChar, Label.c_str()); m_RefColToTestColVec[i][RefCol] = TestCol; } } } void QScorer::InitColPosVecs() { const uint N = SIZE(m_RefSeqIndexes); if (N == 0) Die("No ref labels found in test MSA"); m_PosToTestColVec.clear(); m_PosToRefColVec.clear(); m_TestColToPosVec.clear(); m_RefColToPosVec.clear(); m_RefColToTestColVec.clear(); m_PosToTestColVec.resize(N); m_PosToRefColVec.resize(N); m_TestColToPosVec.resize(N); m_RefColToPosVec.resize(N); m_RefColToTestColVec.resize(N); for (uint i = 0; i < N; ++i) InitColPosVecs1(i); } void QScorer::InitRefCols() { const uint RefColCount = GetRefColCount(); m_RefCols.clear(); for (uint RefCol = 0; RefCol < RefColCount; ++RefCol) if (m_Ref->ColIsUpper(RefCol, m_MaxGapFract)) m_RefCols.push_back(RefCol); } void QScorer::InitRefUngappedCounts() { m_RefAlignedColCount = SIZE(m_RefCols); if (m_RefAlignedColCount == 0) Die("Qscorer: No upper case columns in ref"); m_RefUngappedCounts.clear(); const uint N = SIZE(m_RefSeqIndexes); for (uint k = 0; k < m_RefAlignedColCount; ++k) { uint RefCol = m_RefCols[k]; uint UngappedCount = 0; for (uint i = 0; i < N; ++i) { uint RefSeqIndex = m_RefSeqIndexes[i]; char c = m_Ref->GetChar(RefSeqIndex, RefCol); if (!isgap(c)) ++UngappedCount; } m_RefUngappedCounts.push_back(UngappedCount); } } void QScorer::DoRefCol(uint k) { uint RefCol = m_RefCols[k]; const uint RefSeqCount = GetRefSeqCount(); uint64 CorrectPairsCol = 0; vector TestColIndexesFound; const uint N = SIZE(m_RefSeqIndexes); uint TestLetterCount = 0; for (uint i = 0; i < N; ++i) { uint TestCol = 
m_RefColToTestColVec[i][RefCol]; if (TestCol != UINT_MAX) { ++TestLetterCount; asserta(TestCol < SIZE(m_TestColToCount)); if (m_TestColToCount[TestCol] == 0) TestColIndexesFound.push_back(TestCol); m_TestColToCount[TestCol] += 1; } } uint64 MaxCount = 0; uint BestTestCol = UINT_MAX; for (uint j = 0; j < SIZE(TestColIndexesFound); ++j) { uint TestCol = TestColIndexesFound[j]; uint Count = m_TestColToCount[TestCol]; asserta(Count > 0); uint Count64 = Count; if (Count > MaxCount) { MaxCount = Count; BestTestCol = TestCol; } CorrectPairsCol += (Count64*(Count64 - 1))/2; // Reset so all zero at start of next DoRefCol(). m_TestColToCount[TestCol] = 0; } m_CorrectPairs += CorrectPairsCol; if (MaxCount <= TestLetterCount/2) BestTestCol = UINT_MAX; m_BestTestCols.push_back(BestTestCol); float MaxFract = float(MaxCount)/RefSeqCount; m_MaxFracts.push_back(MaxFract); asserta(k < SIZE(m_RefUngappedCounts)); uint64 UngappedCount = m_RefUngappedCounts[k]; uint64 UngappedPairCount = (UngappedCount*(UngappedCount - 1))/2; m_TotalPairs += UngappedPairCount; asserta(UngappedPairCount >= CorrectPairsCol); if (UngappedPairCount == CorrectPairsCol) ++m_CorrectCols; } void QScorer::DoRefCols() { m_BestTestCols.clear(); m_MaxFracts.clear(); m_TestColToCount.clear(); const uint TestColCount = GetTestColCount(); m_TestColToCount.resize(TestColCount, 0); m_CorrectPairs = 0; m_CorrectCols = 0; for (uint k = 0; k < m_RefAlignedColCount; ++k) DoRefCol(k); } void QScorer::SetTestColToBestRefCol() { const uint TestColCount = GetTestColCount(); m_TestColToBestRefCol.resize(TestColCount, UINT_MAX); for (uint k = 0; k < m_RefAlignedColCount; ++k) { uint RefCol = m_RefCols[k]; uint BestTestCol = m_BestTestCols[k]; if (RefCol == UINT_MAX || BestTestCol == UINT_MAX) continue; asserta(BestTestCol < SIZE(m_TestColToBestRefCol)); m_TestColToBestRefCol[BestTestCol] = RefCol; } } void QScorer::Run(const MSA &Test, const MSA &Ref) { Clear(); m_Test = &Test; m_Ref = &Ref; const uint TestSeqCount = 
Test.GetSeqCount(); const uint RefSeqCount = Ref.GetSeqCount(); const uint TestColCount = Test.GetColCount(); const uint RefColCount = Ref.GetColCount(); InitRefLabels(); InitRefToTest(); InitColPosVecs(); InitRefCols(); InitRefUngappedCounts(); DoRefCols(); SetTestColToBestRefCol(); m_Q = float(m_CorrectPairs)/float(m_TotalPairs); m_TC = float(m_CorrectCols)/float(m_RefAlignedColCount); } void QScorer::UpdateRefLetterCountsCol(uint k, vector > &LetterCountsVec) const { asserta(k < SIZE(m_RefCols)); asserta(k < SIZE(m_BestTestCols)); uint RefCol = m_RefCols[k]; uint BestTestCol = m_BestTestCols[k]; const uint N = SIZE(m_RefSeqIndexes); asserta(SIZE(m_TestSeqIndexes) == N); for (uint i = 0; i < N; ++i) { uint Pos = m_RefColToPosVec[i][RefCol]; if (Pos == UINT_MAX) continue; uint TestCol = m_PosToTestColVec[i][Pos]; if (TestCol == BestTestCol) { uint RefSeqIndex = m_RefSeqIndexes[i]; LetterCountsVec[RefSeqIndex][RefCol] += 1; } } } /*** LetterCountsVec[RefSeqIndex][RefColIndex] is the number of times this position appears in the best-match test column, will be zero or one on first pass, can be incremented by calling again with more test MSAs with the same ref MSA. 
***/ void QScorer::UpdateRefLetterCounts(vector > &LetterCountsVec) const { const uint RefSeqCount = GetRefSeqCount(); const uint RefColCount = GetRefColCount(); if (LetterCountsVec.empty()) { LetterCountsVec.clear(); LetterCountsVec.resize(RefSeqCount); for (uint RefSeqIndex = 0; RefSeqIndex < RefSeqCount; ++RefSeqIndex) LetterCountsVec[RefSeqIndex].resize(RefColCount, 0); } else { asserta(RefSeqCount > 0); asserta(SIZE(LetterCountsVec) == RefSeqCount); asserta(SIZE(LetterCountsVec[0]) == RefColCount); } const uint K = SIZE(m_RefCols); for (uint k = 0; k < K; ++k) UpdateRefLetterCountsCol(k, LetterCountsVec); } muscle-5.1.0/src/qscorer.h000066400000000000000000000031001424453062600154230ustar00rootroot00000000000000#pragma once class QScorer { public: const MSA *m_Test; const MSA *m_Ref; double m_MaxGapFract = 1.0; uint m_RefAlignedColCount = 0; vector m_Labels; vector m_RefSeqIndexes; vector m_TestSeqIndexes; vector m_RefSeqIndexToTestSeqIndex; vector m_RefCols; vector m_RefUngappedCounts; vector > m_PosToTestColVec; vector > m_PosToRefColVec; vector > m_TestColToPosVec; vector > m_RefColToPosVec; vector > m_RefColToTestColVec; vector m_TestColToBestRefCol; vector m_MaxFracts; vector m_BestTestCols; uint64 m_TotalPairs = 0; uint64 m_TotalCols = 0; uint64 m_CorrectPairs = 0; uint m_CorrectCols = 0; float m_Q = 0; float m_TC = 0; vector m_RefLabels; map m_RefLabelToSeqIndex; vector m_TestColToCount; public: void Clear(); void Run(const MSA &Test, const MSA &Ref); void InitRefLabels(); void InitRefToTest(); void InitColPosVecs(); void InitColPosVecs1(uint i); void InitRefCols(); void InitRefUngappedCounts(); void DoRefCols(); void DoRefCol(uint k); void SetTestColToBestRefCol(); void UpdateRefLetterCounts(vector > &LetterCountsVec) const; void UpdateRefLetterCountsCol(uint k, vector > &LetterCountsVec) const; uint GetRefSeqCount() const { return m_Ref->GetSeqCount(); } uint GetTestSeqCount() const { return m_Test->GetSeqCount(); } uint GetRefColCount() const { 
return m_Ref->GetColCount(); } uint GetTestColCount() const { return m_Test->GetColCount(); } }; muscle-5.1.0/src/qscorer3.h000066400000000000000000000012021424453062600155070ustar00rootroot00000000000000#pragma once #include "qscorer.h" class QScorer3 { public: MSA m_Test1; MSA m_Test2; MSA m_Ref; QScorer m_QS1; QScorer m_QS2; vector m_Indexes2; const vector *m_RefCols = 0; uint m_RefAlignedColCount = 0; vector > m_Pairs; vector m_PairIndexToQ1; vector m_PairIndexToQ2; vector m_PairIndexToPWC; public: void Run(const string &TestFileName1, const string &TestFileName2, const string &RefFileName, double MaxGapFract); void ToTSV(const string &FileName) const; void ToTSV(FILE *f) const; private: void TransQ(); void TransQPair(uint Indexi, uint Indexj); }; muscle-5.1.0/src/quarts.cpp000066400000000000000000000022561424453062600156320ustar00rootroot00000000000000#include "myutils.h" #include "sort.h" #include "quarts.h" void GetQuarts(const vector &v, Quarts &Q) { const unsigned N = SIZE(v); Q.Min = 0; Q.LoQ = 0; Q.Med = 0; Q.HiQ = 0; Q.Max = 0; Q.Total = 0; Q.Avg = 0.0; if (N == 0) return; vector v2 = v; unsigned *vs = v2.data(); QuickSortInPlace(vs, N); for (unsigned i = 0; i < N; ++i) Q.Total += vs[i]; Q.Min = vs[0]; Q.LoQ = vs[N/4]; Q.Med = vs[N/2]; Q.HiQ = vs[(3*N)/4]; Q.Max = vs[N-1]; Q.Avg = double(Q.Total)/N; } void GetQuartsFloat(const vector &v, QuartsFloat &Q) { const unsigned N = SIZE(v); Q.Min = 0.0f; Q.LoQ = 0.0f; Q.Med = 0.0f; Q.HiQ = 0.0f; Q.Max = 0.0f; Q.Total = 0.0f; Q.Avg = 0.0f; if (N == 0) return; vector v2 = v; float *vs = v2.data(); QuickSortInPlace(vs, N); for (unsigned i = 0; i < N; ++i) Q.Total += vs[i]; float Mean = float(Q.Total)/N; float Sumd = 0.0f; for (unsigned i = 0; i < N; ++i) { float x = vs[i]; float d = (x - Mean)*(x - Mean); Sumd += d; } float StdDev = (float) sqrt(Sumd/N); Q.Min = vs[0]; Q.LoQ = vs[N/4]; Q.Med = vs[N/2]; Q.HiQ = vs[(3*N)/4]; Q.Max = vs[N-1]; Q.Avg = Mean; Q.StdDev = StdDev; } 
muscle-5.1.0/src/quarts.h000066400000000000000000000006511424453062600152740ustar00rootroot00000000000000#ifndef quarts_h #define quarts_h struct Quarts { unsigned Min; unsigned LoQ; unsigned Med; unsigned HiQ; unsigned Max; unsigned Total; double Avg; }; struct QuartsFloat { float Min; float LoQ; float Med; float HiQ; float Max; float Total; float Avg; float StdDev; }; void GetQuarts(const vector &v, Quarts &Q); void GetQuartsFloat(const vector &v, QuartsFloat &Q); #endif // quarts_h muscle-5.1.0/src/randomchaintree.cpp000066400000000000000000000026111424453062600174510ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" static void MakeRandomChainTree(const vector &Labels, Tree &T) { vector Parents; vector Lengths; const uint SeqCount = SIZE(Labels); vector SeqIndexes; for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) SeqIndexes.push_back(SeqIndex); Shuffle(SeqIndexes); Parents.resize(2*SeqCount - 1, UINT_MAX); vector NodeLabels; for (uint i = 0; i < SeqCount; ++i) { uint SeqIndex = SeqIndexes[i]; const string &Label = Labels[SeqIndex]; Lengths.push_back(1); NodeLabels.push_back(Label); } for (uint i = 0; i + 1 < SeqCount; ++i) { if (i == 0) { uint Left = SeqIndexes[0]; uint Right = SeqIndexes[1]; Parents[Left] = SeqCount; Parents[Right] = SeqCount; } else { uint Left = SeqCount + i - 1; uint Right = SeqIndexes[i+1]; Parents[Left] = SeqCount + i; Parents[Right] = SeqCount + i; } NodeLabels.push_back(""); Lengths.push_back(1); } T.FromVectors(NodeLabels, Parents, Lengths); } void MPCFlat::CalcGuideTree_RandomChain() { MakeRandomChainTree(m_Labels, m_GuideTree); } void cmd_labels2randomchaintree() { const string &LabelsFileName = opt(labels2randomchaintree); const string &NewickFileName = opt(output); vector Labels; ReadStringsFromFile(LabelsFileName, Labels); Tree T; MakeRandomChainTree(Labels, T); T.ToFile(NewickFileName); } 
muscle-5.1.0/src/refineflat.cpp000066400000000000000000000012571424453062600164320ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" void MPCFlat::RefineIter() { set SeqIndexes1, SeqIndexes2; const uint SeqCount = GetSeqCount(); asserta(m_MSA != 0); asserta(m_MSA->GetSeqCount() == SeqCount); // create two separate groups for (uint SeqIndex = 0; SeqIndex < SeqCount; SeqIndex++) if (rand()%2 == 0) SeqIndexes1.insert(SeqIndex); else SeqIndexes2.insert(SeqIndex); if (SeqIndexes1.empty() || SeqIndexes2.empty()) return; const MultiSequence *MSA1 = m_MSA->Project(SeqIndexes1); const MultiSequence *MSA2 = m_MSA->Project(SeqIndexes2); delete m_MSA; MultiSequence *MSA12 = AlignAlns(*MSA1, *MSA2); m_MSA = MSA12; delete MSA1; delete MSA2; } muscle-5.1.0/src/relabel.cpp000066400000000000000000000026061424453062600157200ustar00rootroot00000000000000#include "muscle.h" void cmd_relabel() { MultiSequence M; M.FromFASTA(opt(relabel)); FILE *f = OpenStdioFile(opt(labels2)); string Line; vector Fields; map OldLabelToNewLabel; uint LabelCount = 0; while (ReadLineStdioFile(f, Line)) { Split(Line, Fields, '\t'); if (SIZE(Fields) != 2) Die("Expected 2 fields in line '%s'", Line.c_str()); const string &OldLabel = Fields[0]; const string &NewLabel = Fields[1]; if (OldLabelToNewLabel.find(OldLabel) != OldLabelToNewLabel.end()) Die("Dupe label >%s", OldLabel.c_str()); OldLabelToNewLabel[OldLabel] = NewLabel; ++LabelCount; } CloseStdioFile(f); const uint SeqCount = M.GetSeqCount(); uint NotFound = 0; uint Found = 0; FILE *fOut = CreateStdioFile(opt(output)); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { const Sequence *S = M.GetSequence(SeqIndex); const string &OldLabel = M.GetLabel(SeqIndex); map::const_iterator p = OldLabelToNewLabel.find(OldLabel); if (p == OldLabelToNewLabel.end()) { S->WriteMFA(fOut); ++NotFound; if (NotFound < 10) ProgressLog("Not found >%s\n", OldLabel.c_str()); else if (NotFound == 10) ProgressLog("10+ Not found\n"); continue; } const 
string &NewLabel = p->second; const byte *ByteSeq = S->GetBytePtr(); uint L = S->GetLength(); SeqToFasta(fOut, ByteSeq, L, NewLabel.c_str()); } } muscle-5.1.0/src/relaxflat.cpp000066400000000000000000000044471424453062600163010ustar00rootroot00000000000000#include "muscle.h" #include "mpcflat.h" void RelaxFlat_XZ_ZY(const MySparseMx &XZ, const MySparseMx &ZY, float *Post) { const uint LX = XZ.GetLX(); const uint LZ = XZ.GetLY(); const uint LY = ZY.GetLY(); asserta(ZY.GetLX() == LZ); for (uint PosX = 0; PosX < LX; ++PosX) { uint Offset_XZ = XZ.GetOffset(PosX); uint Size_XZ = XZ.GetSize(PosX); for (uint k = 0; k < Size_XZ; ++k) { float P_XZ = XZ.GetProb_Offset(Offset_XZ + k); uint PosZ = XZ.GetCol_Offset(Offset_XZ + k); uint Offset_ZY = ZY.GetOffset(PosZ); uint Size_ZY = ZY.GetSize(PosZ); for (uint m = 0; m < Size_ZY; ++m) { float P_ZY = ZY.GetProb_Offset(Offset_ZY + m); uint PosY = ZY.GetCol_Offset(Offset_ZY + m); Post[PosX*LY + PosY] += P_XZ*P_ZY; } } } } void RelaxFlat_ZX_ZY(const MySparseMx &ZX, const MySparseMx &ZY, float *Post) { const uint LZ = ZX.GetLX(); const uint LX = ZX.GetLY(); const uint LY = ZY.GetLY(); asserta(ZY.GetLX() == LZ); for (uint PosZ = 0; PosZ < LZ; ++PosZ) { uint Offset_ZX = ZX.GetOffset(PosZ); uint Size_ZX = ZX.GetSize(PosZ); for (uint k = 0; k < Size_ZX; ++k) { float P_ZX = ZX.GetProb_Offset(Offset_ZX + k); uint PosX = ZX.GetCol_Offset(Offset_ZX + k); uint Offset_ZY = ZY.GetOffset(PosZ); uint Size_ZY = ZY.GetSize(PosZ); for (uint m = 0; m < Size_ZY; ++m) { float P_ZY = ZY.GetProb_Offset(Offset_ZY + m); uint PosY = ZY.GetCol_Offset(Offset_ZY + m); Post[PosX*LY + PosY] += P_ZX*P_ZY; } } } } void RelaxFlat_XZ_YZ(const MySparseMx &XZ, const MySparseMx &YZ, float *Post) { const uint LX = XZ.GetLX(); const uint LZ = XZ.GetLY(); const uint LY = YZ.GetLX(); asserta(YZ.GetLY() == LZ); vector PosZToLoPosY; vector PosZToHiPosY; YZ.GetColToRowLoHi(PosZToLoPosY, PosZToHiPosY); for (uint PosX = 0; PosX < LX; ++PosX) { uint Offset_XZ = 
XZ.GetOffset(PosX); uint Size_XZ = XZ.GetSize(PosX); for (uint k = 0; k < Size_XZ; ++k) { float P_XZ = XZ.GetProb_Offset(Offset_XZ + k); uint PosZ = XZ.GetCol_Offset(Offset_XZ + k); uint LoPosY = PosZToLoPosY[PosZ]; uint HiPosY = PosZToHiPosY[PosZ]; if (LoPosY == UINT_MAX) continue; for (uint PosY = LoPosY; PosY <= HiPosY; ++PosY) { float P_YZ = YZ.GetProb(PosY, PosZ); Post[PosX*LY + PosY] += P_XZ*P_YZ; } } } } muscle-5.1.0/src/resample.cpp000066400000000000000000000034431424453062600161220ustar00rootroot00000000000000#include "muscle.h" #include "ensemble.h" void cmd_resample() { const string &FileName = opt(resample); const string &OutputPattern = opt(output); if (OutputPattern.empty()) Die("Must set -output"); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); double MinConf = 0.5; if (optset_minconf) MinConf = opt(minconf); uint ReplicateCount = 100; if (optset_replicates) ReplicateCount = opt(replicates); Ensemble E; E.FromFile(FileName); uint SiteCount = E.GetMedianHiQualColCount(MaxGapFract, MinConf); if (SiteCount == 0) Die("All columns low qual (max fract %.3g, min conf %.3g)", MaxGapFract, MinConf); ProgressLog("Site count %u\n", SiteCount); if (SiteCount < 20) Warning("Very low hi qual site count"); vector NonGappyUniqueIxs; E.GetHiQualUniqueIxs(MaxGapFract, MinConf, NonGappyUniqueIxs); const uint N = SIZE(NonGappyUniqueIxs); bool OutputWildCard = (OutputPattern.find('@') != string::npos); FILE *fOut = 0; if (!OutputWildCard) fOut = CreateStdioFile(OutputPattern); for (uint RepIndex = 0; RepIndex < ReplicateCount; ++RepIndex) { if (ReplicateCount > 1) ProgressStep(RepIndex, ReplicateCount, "Resampling"); vector ResampledUniqueIxs; for (uint i = 0; i < SiteCount; ++i) { uint r = randu32()%N; uint UniqueIx = NonGappyUniqueIxs[r]; ResampledUniqueIxs.push_back(UniqueIx); } MSA RepAln; E.MakeResampledMSA(ResampledUniqueIxs, RepAln); if (OutputWildCard) { string OutputFileName; MakeReplicateFileName_N(OutputPattern, 
RepIndex+1, OutputFileName); fOut = CreateStdioFile(OutputFileName); } else Pf(fOut, "b? a+log1pexp(b-a): b+log1pexp(a-b); } inline double log1pexp(double x) { return x<-709.089565713? 0.: log1p(exp(x)); } inline double sum_log_prob(double a, double b) { return a>b? a+log1pexp(b-a): b+log1pexp(a-b); } inline long double log1pexp(long double x) { return x<-11355.8302591? 0.: log1p(exp(x)); } inline long double sum_log_prob(long double a, long double b) { return a>b? a+log1pexp(b-a): b+log1pexp(a-b); } A careful coder would check exactly where the exp(x) computation underflows, rather than relying on the theoretical estimates made here, as there could be implementation-dependent details that cause underflow at a higher threshold than strictly necessary. ***/ const float LOG_ZERO = -2e20f; const float LOG_ONE = 0.0f; const float INVALID_LOG = FLT_MAX; const float UNINIT_LOG = 9e9f; const float OUT_OF_BAND_LOG = 8e8f; const float EXP_UNDERFLOW_THRESHOLD = -4.6f; const float LOG_UNDERFLOW_THRESHOLD = 7.5f; // Computes log (exp (x) + 1), for 0 <= x <= 7.5. // This is 2x faster than the libary function log1p() inline float LOGEXP1(float x) { assert(x >= 0.00f); assert(x <= LOG_UNDERFLOW_THRESHOLD); if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f; if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f; if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f; assert(x <= LOG_UNDERFLOW_THRESHOLD); return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f; } inline void LOG_PLUS_EQUALS(float& x, float y) { if (x < y) x = (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOGEXP1(y - x) + x; else x = (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? 
x : LOGEXP1(x - y) + y; } inline float LOG_ADD(float x, float y) { if (x < y) return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOGEXP1(y - x) + x; return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : LOGEXP1(x - y) + y; } inline float LOG_ADD(float x1, float x2, float x3) { return LOG_ADD(x1, LOG_ADD(x2, x3)); } inline float LOG_ADD(float x1, float x2, float x3, float x4) { return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, x4))); } inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5) { return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, x5)))); } inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6) { return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, x6))))); } inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6, float x7) { return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, LOG_ADD(x6, x7)))))); } muscle-5.1.0/src/seb8.cpp000066400000000000000000000171761424453062600151630ustar00rootroot00000000000000#include "myutils.h" #include "alpha3.h" // [0] = AST // [1] = C // [2] = DHN // [3] = EKQR // [4] = FWY // [5] = G // [6] = ILMV // [7] = P byte g_CharToLetterSEB8[256] = { INVALID_LETTER, // [ 0] 00 INVALID_LETTER, // [ 1] 01 INVALID_LETTER, // [ 2] 02 INVALID_LETTER, // [ 3] 03 INVALID_LETTER, // [ 4] 04 INVALID_LETTER, // [ 5] 05 INVALID_LETTER, // [ 6] 06 INVALID_LETTER, // [ 7] 07 INVALID_LETTER, // [ 8] 08 INVALID_LETTER, // [ 9] 09 INVALID_LETTER, // [ 10] 0a INVALID_LETTER, // [ 11] 0b INVALID_LETTER, // [ 12] 0c INVALID_LETTER, // [ 13] 0d INVALID_LETTER, // [ 14] 0e INVALID_LETTER, // [ 15] 0f INVALID_LETTER, // [ 16] 10 INVALID_LETTER, // [ 17] 11 INVALID_LETTER, // [ 18] 12 INVALID_LETTER, // [ 19] 13 INVALID_LETTER, // [ 20] 14 INVALID_LETTER, // [ 21] 15 INVALID_LETTER, // [ 22] 16 INVALID_LETTER, // [ 23] 17 INVALID_LETTER, // [ 24] 18 INVALID_LETTER, // [ 25] 19 INVALID_LETTER, // [ 26] 1a INVALID_LETTER, // [ 27] 1b 
INVALID_LETTER, // [ 28] 1c INVALID_LETTER, // [ 29] 1d INVALID_LETTER, // [ 30] 1e INVALID_LETTER, // [ 31] 1f INVALID_LETTER, // [ 32] 20 ' ' INVALID_LETTER, // [ 33] 21 '!' INVALID_LETTER, // [ 34] 22 '"' INVALID_LETTER, // [ 35] 23 '#' INVALID_LETTER, // [ 36] 24 '$' INVALID_LETTER, // [ 37] 25 '%' INVALID_LETTER, // [ 38] 26 '&' INVALID_LETTER, // [ 39] 27 ''' INVALID_LETTER, // [ 40] 28 '(' INVALID_LETTER, // [ 41] 29 ')' INVALID_LETTER, // [ 42] 2a '*' INVALID_LETTER, // [ 43] 2b '+' INVALID_LETTER, // [ 44] 2c ',' INVALID_LETTER, // [ 45] 2d '-' INVALID_LETTER, // [ 46] 2e '.' INVALID_LETTER, // [ 47] 2f '/' INVALID_LETTER, // [ 48] 30 '0' INVALID_LETTER, // [ 49] 31 '1' INVALID_LETTER, // [ 50] 32 '2' INVALID_LETTER, // [ 51] 33 '3' INVALID_LETTER, // [ 52] 34 '4' INVALID_LETTER, // [ 53] 35 '5' INVALID_LETTER, // [ 54] 36 '6' INVALID_LETTER, // [ 55] 37 '7' INVALID_LETTER, // [ 56] 38 '8' INVALID_LETTER, // [ 57] 39 '9' INVALID_LETTER, // [ 58] 3a ':' INVALID_LETTER, // [ 59] 3b ';' INVALID_LETTER, // [ 60] 3c '<' INVALID_LETTER, // [ 61] 3d '=' INVALID_LETTER, // [ 62] 3e '>' INVALID_LETTER, // [ 63] 3f '?' 
INVALID_LETTER, // [ 64] 40 '@' 0, // [ 65] 41 A {AST} INVALID_LETTER, // [ 66] 42 'B' 1, // [ 67] 43 C {C} 2, // [ 68] 44 D {DHN} 3, // [ 69] 45 E {EKQR} 4, // [ 70] 46 F {FWY} 5, // [ 71] 47 G {G} 2, // [ 72] 48 H {DHN} 6, // [ 73] 49 I {ILMV} INVALID_LETTER, // [ 74] 4a 'J' 3, // [ 75] 4b K {EKQR} 6, // [ 76] 4c L {ILMV} 6, // [ 77] 4d M {ILMV} 2, // [ 78] 4e N {DHN} INVALID_LETTER, // [ 79] 4f 'O' 7, // [ 80] 50 P {P} 3, // [ 81] 51 Q {EKQR} 3, // [ 82] 52 R {EKQR} 0, // [ 83] 53 S {AST} 0, // [ 84] 54 T {AST} INVALID_LETTER, // [ 85] 55 'U' 6, // [ 86] 56 V {ILMV} 4, // [ 87] 57 W {FWY} INVALID_LETTER, // [ 88] 58 'X' 4, // [ 89] 59 Y {FWY} INVALID_LETTER, // [ 90] 5a 'Z' INVALID_LETTER, // [ 91] 5b '[' INVALID_LETTER, // [ 92] 5c '\' INVALID_LETTER, // [ 93] 5d ']' INVALID_LETTER, // [ 94] 5e '^' INVALID_LETTER, // [ 95] 5f '_' INVALID_LETTER, // [ 96] 60 '`' INVALID_LETTER, // [ 97] 61 'a' INVALID_LETTER, // [ 98] 62 'b' INVALID_LETTER, // [ 99] 63 'c' INVALID_LETTER, // [100] 64 'd' INVALID_LETTER, // [101] 65 'e' INVALID_LETTER, // [102] 66 'f' INVALID_LETTER, // [103] 67 'g' INVALID_LETTER, // [104] 68 'h' INVALID_LETTER, // [105] 69 'i' INVALID_LETTER, // [106] 6a 'j' INVALID_LETTER, // [107] 6b 'k' INVALID_LETTER, // [108] 6c 'l' INVALID_LETTER, // [109] 6d 'm' INVALID_LETTER, // [110] 6e 'n' INVALID_LETTER, // [111] 6f 'o' INVALID_LETTER, // [112] 70 'p' INVALID_LETTER, // [113] 71 'q' INVALID_LETTER, // [114] 72 'r' INVALID_LETTER, // [115] 73 's' INVALID_LETTER, // [116] 74 't' INVALID_LETTER, // [117] 75 'u' INVALID_LETTER, // [118] 76 'v' INVALID_LETTER, // [119] 77 'w' INVALID_LETTER, // [120] 78 'x' INVALID_LETTER, // [121] 79 'y' INVALID_LETTER, // [122] 7a 'z' INVALID_LETTER, // [123] 7b '{' INVALID_LETTER, // [124] 7c '|' INVALID_LETTER, // [125] 7d '}' INVALID_LETTER, // [126] 7e '~' INVALID_LETTER, // [127] 7f INVALID_LETTER, // [128] 80 INVALID_LETTER, // [129] 81 INVALID_LETTER, // [130] 82 INVALID_LETTER, // [131] 83 INVALID_LETTER, // 
[132] 84 INVALID_LETTER, // [133] 85 INVALID_LETTER, // [134] 86 INVALID_LETTER, // [135] 87 INVALID_LETTER, // [136] 88 INVALID_LETTER, // [137] 89 INVALID_LETTER, // [138] 8a INVALID_LETTER, // [139] 8b INVALID_LETTER, // [140] 8c INVALID_LETTER, // [141] 8d INVALID_LETTER, // [142] 8e INVALID_LETTER, // [143] 8f INVALID_LETTER, // [144] 90 INVALID_LETTER, // [145] 91 INVALID_LETTER, // [146] 92 INVALID_LETTER, // [147] 93 INVALID_LETTER, // [148] 94 INVALID_LETTER, // [149] 95 INVALID_LETTER, // [150] 96 INVALID_LETTER, // [151] 97 INVALID_LETTER, // [152] 98 INVALID_LETTER, // [153] 99 INVALID_LETTER, // [154] 9a INVALID_LETTER, // [155] 9b INVALID_LETTER, // [156] 9c INVALID_LETTER, // [157] 9d INVALID_LETTER, // [158] 9e INVALID_LETTER, // [159] 9f INVALID_LETTER, // [160] a0 INVALID_LETTER, // [161] a1 INVALID_LETTER, // [162] a2 INVALID_LETTER, // [163] a3 INVALID_LETTER, // [164] a4 INVALID_LETTER, // [165] a5 INVALID_LETTER, // [166] a6 INVALID_LETTER, // [167] a7 INVALID_LETTER, // [168] a8 INVALID_LETTER, // [169] a9 INVALID_LETTER, // [170] aa INVALID_LETTER, // [171] ab INVALID_LETTER, // [172] ac INVALID_LETTER, // [173] ad INVALID_LETTER, // [174] ae INVALID_LETTER, // [175] af INVALID_LETTER, // [176] b0 INVALID_LETTER, // [177] b1 INVALID_LETTER, // [178] b2 INVALID_LETTER, // [179] b3 INVALID_LETTER, // [180] b4 INVALID_LETTER, // [181] b5 INVALID_LETTER, // [182] b6 INVALID_LETTER, // [183] b7 INVALID_LETTER, // [184] b8 INVALID_LETTER, // [185] b9 INVALID_LETTER, // [186] ba INVALID_LETTER, // [187] bb INVALID_LETTER, // [188] bc INVALID_LETTER, // [189] bd INVALID_LETTER, // [190] be INVALID_LETTER, // [191] bf INVALID_LETTER, // [192] c0 INVALID_LETTER, // [193] c1 INVALID_LETTER, // [194] c2 INVALID_LETTER, // [195] c3 INVALID_LETTER, // [196] c4 INVALID_LETTER, // [197] c5 INVALID_LETTER, // [198] c6 INVALID_LETTER, // [199] c7 INVALID_LETTER, // [200] c8 INVALID_LETTER, // [201] c9 INVALID_LETTER, // [202] ca INVALID_LETTER, // [203] cb 
INVALID_LETTER, // [204] cc INVALID_LETTER, // [205] cd INVALID_LETTER, // [206] ce INVALID_LETTER, // [207] cf INVALID_LETTER, // [208] d0 INVALID_LETTER, // [209] d1 INVALID_LETTER, // [210] d2 INVALID_LETTER, // [211] d3 INVALID_LETTER, // [212] d4 INVALID_LETTER, // [213] d5 INVALID_LETTER, // [214] d6 INVALID_LETTER, // [215] d7 INVALID_LETTER, // [216] d8 INVALID_LETTER, // [217] d9 INVALID_LETTER, // [218] da INVALID_LETTER, // [219] db INVALID_LETTER, // [220] dc INVALID_LETTER, // [221] dd INVALID_LETTER, // [222] de INVALID_LETTER, // [223] df INVALID_LETTER, // [224] e0 INVALID_LETTER, // [225] e1 INVALID_LETTER, // [226] e2 INVALID_LETTER, // [227] e3 INVALID_LETTER, // [228] e4 INVALID_LETTER, // [229] e5 INVALID_LETTER, // [230] e6 INVALID_LETTER, // [231] e7 INVALID_LETTER, // [232] e8 INVALID_LETTER, // [233] e9 INVALID_LETTER, // [234] ea INVALID_LETTER, // [235] eb INVALID_LETTER, // [236] ec INVALID_LETTER, // [237] ed INVALID_LETTER, // [238] ee INVALID_LETTER, // [239] ef INVALID_LETTER, // [240] f0 INVALID_LETTER, // [241] f1 INVALID_LETTER, // [242] f2 INVALID_LETTER, // [243] f3 INVALID_LETTER, // [244] f4 INVALID_LETTER, // [245] f5 INVALID_LETTER, // [246] f6 INVALID_LETTER, // [247] f7 INVALID_LETTER, // [248] f8 INVALID_LETTER, // [249] f9 INVALID_LETTER, // [250] fa INVALID_LETTER, // [251] fb INVALID_LETTER, // [252] fc INVALID_LETTER, // [253] fd INVALID_LETTER, // [254] fe INVALID_LETTER, // [255] ff }; muscle-5.1.0/src/seq.cpp000066400000000000000000000147711424453062600151100ustar00rootroot00000000000000#include "muscle.h" #include "seq.h" #include "textfile.h" #include "msa.h" //#include const size_t MAX_FASTA_LINE = 16000; void Seq::SetName(const char *ptrName) { delete[] m_ptrName; size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::ToFASTAFile(TextFile &File) const { File.PutFormat(">%s\n", m_ptrName); unsigned uColCount = Length(); for (unsigned n = 0; n < uColCount; ++n) { if (n > 
0 && n%60 == 0) File.PutString("\n"); File.PutChar(at(n)); } File.PutString("\n"); } // Return true on end-of-file bool Seq::FromFASTAFile(TextFile &File) { Clear(); char szLine[MAX_FASTA_LINE]; bool bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) return true; if ('>' != szLine[0]) Die("Expecting '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); size_t n = strlen(szLine); if (1 == n) Die("Missing annotation following '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); m_ptrName = new char[n]; strcpy(m_ptrName, szLine + 1); TEXTFILEPOS Pos = File.GetPos(); for (;;) { bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) { if (0 == size()) { Die("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); return true; } return false; } if ('>' == szLine[0]) { if (0 == size()) Die("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); // Rewind to beginning of this line, it's the start of the // next sequence. 
File.SetPos(Pos); return false; } const char *ptrChar = szLine; while (char c = *ptrChar++) { if (isspace(c)) continue; if (IsGapChar(c)) continue; if (!IsResidueChar(c)) { if (isprint(c)) { char w = GetWildcardChar(); Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", c, File.GetFileName(), File.GetLineNr(), w); c = w; } else Die("Invalid byte hex %02x in FASTA file %s line %d", (unsigned char) c, File.GetFileName(), File.GetLineNr()); } c = toupper(c); push_back(c); } Pos = File.GetPos(); } } void Seq::ExtractUngapped(MSA &msa) const { msa.Clear(); unsigned uColCount = Length(); msa.SetSize(1, 1); unsigned uUngappedPos = 0; for (unsigned n = 0; n < uColCount; ++n) { char c = at(n); if (!IsGapChar(c)) msa.SetChar(0, uUngappedPos++, c); } msa.SetSeqName(0, m_ptrName); } void Seq::Copy(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); SetId(rhs.GetId()); } void Seq::CopyReversed(const Seq &rhs) { clear(); const unsigned uLength = rhs.Length(); const unsigned uBase = rhs.Length() - 1; for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(rhs.at(uBase - uColIndex)); const char *ptrName = rhs.GetName(); size_t n = strlen(ptrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, ptrName); } void Seq::StripGaps() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (IsGapChar(c)) erase(p); else ++p; } } void Seq::StripGapsAndWhitespace() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (isspace(c) || IsGapChar(c)) erase(p); else ++p; } } void Seq::ToUpper() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (islower(c)) *p = toupper(c); } } unsigned Seq::GetLetter(unsigned uIndex) const { assert(uIndex < Length()); char c = 
operator[](uIndex); return CharToLetter(c); } bool Seq::EqIgnoreCase(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (IsGapChar(c1)) { if (!IsGapChar(c2)) return false; } else { if (toupper(c1) != toupper(c2)) return false; } } return true; } bool Seq::Eq(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (c1 != c2) return false; } return true; } bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const { const unsigned uThisLength = Length(); const unsigned uOtherLength = s.Length(); unsigned uThisPos = 0; unsigned uOtherPos = 0; int cThis; int cOther; for (;;) { if (uThisPos == uThisLength && uOtherPos == uOtherLength) break; // Set cThis to next non-gap character in this string // or -1 if end-of-string. for (;;) { if (uThisPos == uThisLength) { cThis = -1; break; } else { cThis = at(uThisPos); ++uThisPos; if (!IsGapChar(cThis)) { cThis = toupper(cThis); break; } } } // Set cOther to next non-gap character in s // or -1 if end-of-string. 
for (;;) { if (uOtherPos == uOtherLength) { cOther = -1; break; } else { cOther = s.at(uOtherPos); ++uOtherPos; if (!IsGapChar(cOther)) { cOther = toupper(cOther); break; } } } // Compare characters are corresponding ungapped position if (cThis != cOther) return false; } return true; } unsigned Seq::GetUngappedLength() const { unsigned uUngappedLength = 0; for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsGapChar(c)) ++uUngappedLength; } return uUngappedLength; } void Seq::LogMe() const { Log(">%s\n", m_ptrName); const unsigned n = Length(); for (unsigned i = 0; i < n; ++i) Log("%c", at(i)); Log("\n"); } void Seq::FromString(const char *pstrSeq, const char *pstrName) { clear(); const unsigned uLength = (unsigned) strlen(pstrSeq); for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) push_back(pstrSeq[uColIndex]); size_t n = strlen(pstrName) + 1; m_ptrName = new char[n]; strcpy(m_ptrName, pstrName); } bool Seq::HasGap() const { for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (IsGapChar(c)) return true; } return false; } void Seq::FixAlpha() { for (CharVect::iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsResidueChar(c)) { char w = GetWildcardChar(); // Warning("Invalid residue '%c', replaced by '%c'", c, w); InvalidLetterWarning(c, w); *p = w; } } } muscle-5.1.0/src/seq.h000066400000000000000000000033701424453062600145460ustar00rootroot00000000000000#ifndef Seq_h #define Seq_h #include class TextFile; class MSA; typedef std::vector CharVect; class Seq : public CharVect { public: Seq() { m_ptrName = 0; // Start with moderate size to avoid // thrashing the heap. reserve(200); } virtual ~Seq() { delete[] m_ptrName; } private: // Not implemented; prevent use of copy c'tor and assignment. 
Seq(const Seq &); Seq &operator=(const Seq &); public: void Clear() { clear(); delete[] m_ptrName; m_ptrName = 0; m_uId = UINT_MAX; } const char *GetName() const { return m_ptrName; } unsigned GetId() const { if (UINT_MAX == m_uId) Die("Seq::GetId, id not set"); return m_uId; } void SetId(unsigned uId) { m_uId = uId; } bool FromFASTAFile(TextFile &File); void ToFASTAFile(TextFile &File) const; void ExtractUngapped(MSA &msa) const; void FromString(const char *pstrSeq, const char *pstrName); void Copy(const Seq &rhs); void CopyReversed(const Seq &rhs); void StripGaps(); void StripGapsAndWhitespace(); void ToUpper(); void SetName(const char *ptrName); unsigned GetLetter(unsigned uIndex) const; unsigned Length() const { return (unsigned) size(); } bool Eq(const Seq &s) const; bool EqIgnoreCase(const Seq &s) const; bool EqIgnoreCaseAndGaps(const Seq &s) const; bool HasGap() const; unsigned GetUngappedLength() const; void LogMe() const; char GetChar(unsigned uIndex) const { return operator[](uIndex); } void SetChar(unsigned uIndex, char c) { operator[](uIndex) = c; } void AppendChar(char c) { push_back(c); } void FixAlpha(); #ifndef _WIN32 reference at(size_type i) { return operator[](i); } const_reference at(size_type i) const { return operator[](i); } #endif private: char *m_ptrName; unsigned m_uId; }; #endif // Seq.h muscle-5.1.0/src/sequence.cpp000066400000000000000000000152711424453062600161240ustar00rootroot00000000000000#include "myutils.h" #include "sequence.h" #if SEQ_TRACE #include "locallock.h" static vector g_Files; static vector g_Lines; static vector g_Deleted; static vector g_SeqPtrs; void Sequence::AllocReport(const string &Msg) { Log("\nSequence::AllocReport(%s)\n", Msg.c_str()); uint64 TotalBytes = 0; const uint N = SIZE(g_Files); asserta(SIZE(g_Lines) == N); asserta(SIZE(g_Deleted) == N); asserta(SIZE(g_SeqPtrs) == N); uint DeletedCount = 0; uint NotDeletedCount = 0; Log("\n"); for (uint i = 0; i < N; ++i) { if (g_Deleted[i]) continue; 
++NotDeletedCount; double Bytes = double(g_SeqPtrs.size()); TotalBytes += g_SeqPtrs.capacity(); Log("[%7u] %10.0f bytes %s:%u\n", i, Bytes, g_Files[i], g_Lines[i]); } Log("Seqs %u / %u not freed, bytes %s\n", NotDeletedCount, N, MemBytesToStr(TotalBytes)); } void Sequence::AssertId() const { asserta(m_Id < SIZE(g_Files)); } Sequence *Sequence::_NewSequence(const char *File, int Line) { Sequence *Seq = new Sequence; Lock(); uint Id = SIZE(g_Files); Seq->m_Id = Id; g_Files.push_back(File); g_Lines.push_back(Line); g_Deleted.push_back(false); g_SeqPtrs.push_back(Seq); Unlock(); return Seq; } void Sequence::_DeleteSequence(const Sequence *s, const char *File, int Line) { uint Id = s->m_Id; asserta(Id < SIZE(g_Deleted)); asserta(!g_Deleted[Id]); delete s; g_Deleted[Id] = true; } #else Sequence *Sequence::_NewSequence() { Sequence *Seq = new Sequence; return Seq; } void Sequence::_DeleteSequence(const Sequence *Seq) { delete Seq; } #endif void Sequence::Create(const vector *a_data, string a_label, uint GSI, uint SMI) { m_CharVec = *a_data; m_Label = a_label; m_GSI = GSI; m_SMI = SMI; } bool Sequence::FromFileBuffer(FileBuffer& infile, bool stripGaps) { if (infile.eof()) return false; m_GSI = UINT_MAX; m_SMI = UINT_MAX; m_Label = "~"; // Skip blank lines for (;;) { if (infile.eof()) { if (m_Label.empty()) return false; asserta(false); } infile.GetLine(m_Label); if (m_Label.length() > 0) break; } if (m_Label[0] != '>') Die("Expected '>' in FASTA, got '%s'", m_Label.c_str()); // Remove leading ">" m_Label = m_Label.substr(1); if (opt(accs)) { string Acc; GetAccFromLabel(m_Label, Acc); m_Label = Acc; } m_CharVec.clear(); m_CharVec.push_back('@'); char ch; while (infile.Get(ch)) { if (ch == '>') { infile.UnGet(); break; } if (isspace(ch)) continue; if (stripGaps && ch == '-') continue; m_CharVec.push_back(ch); } return true; } void Sequence::WriteMFA(FILE *f) const { const vector &v = m_CharVec; const int L = GetLength(); byte *Seq = myalloc(byte, L); for (int i = 0; i < L; 
++i) Seq[i] = v[i+1]; SeqToFasta(f, Seq, (uint) L, m_Label.c_str()); } Sequence* Sequence::Clone() const { Sequence* ret = NewSequence(); asserta(ret); ret->m_Label = m_Label; ret->m_CharVec = m_CharVec; ret->m_GSI = m_GSI; ret->m_SMI = m_SMI; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::AddGaps() // // Given an vector containing the skeleton for an // alignment and the identity of the current character, this // routine will create a new sequence with all necesssary gaps added. // For instance, // alignment = "XXXBBYYYBBYYXX" // id = 'X' // will perform the transformation // "ATGCAGTCA" --> "ATGCC---GT--CA" // (XXXBBYYYBBYYXX) ///////////////////////////////////////////////////////////////// Sequence* Sequence::AddGaps(const vector* alignment, char id) const { Sequence* ret = NewSequence(); assert(ret); ret->m_GSI = m_GSI; ret->m_SMI = m_SMI; ret->m_Label = m_Label; ret->m_CharVec.clear(); ret->m_CharVec.push_back('@'); vector::const_iterator dataIter = m_CharVec.begin() + 1; for (vector::const_iterator iter = alignment->begin(); iter != alignment->end(); ++iter) { if (*iter == 'B' || *iter == id) { ret->m_CharVec.push_back(*dataIter); ++dataIter; } else ret->m_CharVec.push_back('-'); } return ret; } Sequence* Sequence::AddGapsPath(const string &Path, char id) const { Sequence* ret = NewSequence(); assert(ret); ret->m_GSI = m_GSI; ret->m_SMI = m_SMI; ret->m_Label = m_Label; ret->m_CharVec.clear(); ret->m_CharVec.push_back('@'); vector::const_iterator dataIter = m_CharVec.begin() + 1; for (string::const_iterator iter = Path.begin(); iter != Path.end(); ++iter) { if (*iter == 'B' || *iter == id) { ret->m_CharVec.push_back(*dataIter); ++dataIter; } else ret->m_CharVec.push_back('-'); } return ret; } // Returns vector containing 1-based col indexes, e.g. if m_CharVec // is "ATGCC---GT--CA" vector is set to {1,2,3,4,5,9,10,13,14}. 
void Sequence::GetPosToCol_OneBased(vector &PosToCol) const { PosToCol.clear(); PosToCol.push_back(UINT_MAX); uint L = GetLength(); for (uint i = 1; i <= L; i++) { if (m_CharVec[i] != '-') PosToCol.push_back(i); } } // Returns vector containing 0-based col indexes, e.g. // "ATGCC---GT--CA" PosToCol={0,1,2,3,4,8,9,12,13}. void Sequence::GetPosToCol(vector &PosToCol) const { PosToCol.clear(); const uint ColCount = GetLength(); const byte *ByteSeq = GetBytePtr(); for (uint Col = 0; Col < ColCount; ++Col) { if (ByteSeq[Col] != '-') PosToCol.push_back(Col); } } void Sequence::GetColToPos(vector &ColToPos) const { ColToPos.clear(); const uint ColCount = GetLength(); const byte *ByteSeq = GetBytePtr(); uint Pos = 0; for (uint Col = 0; Col < ColCount; ++Col) { if (ByteSeq[Col] == '-') ColToPos.push_back(UINT_MAX); else ColToPos.push_back(Pos++); } } Sequence *Sequence::DeleteGaps() const { Sequence* ret = NewSequence(); asserta(ret); ret->m_Label = m_Label; ret->m_GSI = m_GSI; ret->m_SMI = m_SMI; ret->m_CharVec.clear(); asserta(!m_CharVec.empty() && m_CharVec[0] == '@'); int L = GetLength(); ret->m_CharVec.push_back('@'); for (int i = 1; i <= L; ++i) { char c = m_CharVec[i]; if (c != '-') ret->m_CharVec.push_back(c); } return ret; } //void Sequence::Copy(const Sequence &rhs) // { // m_Label = rhs.m_Label; // m_SMI = rhs.m_SMI; // m_GSI = rhs.m_GSI; // m_CharVec = rhs.m_CharVec; // } void Sequence::FromString(const string &Label, const string &Seq) { m_GSI = UINT_MAX; m_SMI = UINT_MAX; m_Label = Label; m_CharVec.clear(); int L = (int) SIZE(Seq); m_CharVec.push_back('@'); for (int i = 0; i < L; ++i) { char c = Seq[i]; m_CharVec.push_back(c); } } void Sequence::LogMe() const { Log("\n"); uint L = GetLength(); Log("Sequence(%p) length %u >%s\n", this, L, m_Label.c_str()); for (uint i = 1; i <= L; ++i) Log("%c", m_CharVec[i]); Log("\n"); } muscle-5.1.0/src/sequence.h000066400000000000000000000060361424453062600155700ustar00rootroot00000000000000#ifndef SEQUENCE_H #define 
SEQUENCE_H #include #include #include #include #include #include #include "filebuffer.h" #define SEQ_TRACE 0 class Sequence; // m_CharVec uses one-based indexing, m_CharVec[0] set to '@'. class Sequence { public: #if SEQ_TRACE uint m_Id = UINT_MAX; #endif // Global input MulitSquence index uint m_GSI = UINT_MAX; // Sparse matrix index uint m_SMI = UINT_MAX; string m_Label; vector m_CharVec; private: Sequence() { #if SEQ_TRACE m_Id = UINT_MAX; #endif m_Label = "~"; m_GSI = UINT_MAX; m_SMI = UINT_MAX; } ~Sequence() {} public: bool FromFileBuffer(FileBuffer& infile, bool stripGaps = false); void Create(const vector *m_CharVec, string m_Label, uint GSI, uint SMI); void FromString(const string &Label, const string &Seq); // void Copy(const Sequence &rhs); void InitData() { m_CharVec.clear(); m_CharVec.push_back('@'); } void AppendChar(char c) { m_CharVec.push_back(c); } const string &GetLabel() const { return m_Label; } const char *GetLabelCStr() const { return m_Label.c_str(); } // One-based char *GetCharPtr1() { return m_CharVec.data(); } const char *GetCharPtr1() const { return m_CharVec.data(); } const byte *GetBytePtr() const { const char *CharPtr = m_CharVec.data() + 1; const byte *BytePtr = (const byte *) CharPtr; return BytePtr; } // Chars stored with one-based indexing. 
char GetPosition(int i) const { assert(i >= 1 && i < m_CharVec.size()); return m_CharVec[i]; } char GetChar(uint ZeroBasedPos) const { return GetPosition(int(ZeroBasedPos+1)); } void SetGSI(uint GSI) { asserta(m_GSI == UINT_MAX); m_GSI = GSI; } void OverwriteGSI(uint GSI) { m_GSI = GSI; } void OverwriteLabel(const string &Label) { m_Label = Label; } uint GetSMI() const { return m_SMI; } uint GetGSI() const { return m_GSI; } uint GetLength() const { uint n = (uint) m_CharVec.size(); asserta(n > 0); uint L = n - 1; return L; } void WriteMFA(FILE *f) const; Sequence* Clone() const; Sequence* AddGaps(const vector* alignment, char id) const; Sequence* AddGapsPath(const string &Path, char id) const; Sequence* DeleteGaps() const; void GetPosToCol_OneBased(vector &PosToCol) const; void GetPosToCol(vector &PosToCol) const; void GetColToPos(vector &ColToPos) const; void LogMe() const; #if SEQ_TRACE void AssertId() const; #endif public: #if SEQ_TRACE static Sequence *_NewSequence(const char *File, int Line); #define NewSequence() Sequence::_NewSequence(__FILE__, __LINE__) static void _DeleteSequence(const Sequence *Seq, const char *File, int Line); #define DeleteSequence(s) Sequence::_DeleteSequence((s), __FILE__, __LINE__) static void AllocReport(const string &Msg); #else static Sequence *_NewSequence(); #define NewSequence() Sequence::_NewSequence() static void _DeleteSequence(const Sequence *Seq); #define DeleteSequence(s) Sequence::_DeleteSequence((s)) #endif }; #endif muscle-5.1.0/src/setprobconsparams.cpp000066400000000000000000000013351424453062600200550ustar00rootroot00000000000000#include "muscle.h" #include "hmmparams.h" static bool g_InitDone = false; void InitProbcons() { if (g_InitDone) return; asserta(g_Alpha == ALPHA_Amino || g_Alpha == ALPHA_Nucleo); HMMParams HP; if (optset_hmmin) { const string FileName = opt(hmmin); ProgressLog("Reading HMM parameters from %s\n", FileName.c_str()); HP.FromFile(FileName); } else { bool Nucleo = (g_Alpha == ALPHA_Nucleo); 
HP.FromDefaults(Nucleo); } if (optset_perturb) { uint Seed = opt(perturb); if (Seed > 0) { ProgressLog("Perturbing HMM parameters with seed %u\n", Seed); ResetRand(Seed); HP.PerturbProbs(Seed); } } if (optset_hmmout) HP.ToFile(opt(hmmout)); HP.ToPairHMM(); g_InitDone = true; } muscle-5.1.0/src/sort.h000066400000000000000000000147671424453062600147610ustar00rootroot00000000000000#ifndef sort_h #define sort_h #include "myutils.h" #include "countsort.h" #include #define StartTimer(x) /* empty */ #define EndTimer(x) /* empty */ inline void Range(vector &v, unsigned N) { v.clear(); v.reserve(N); for (unsigned i = 0; i < N; ++i) v.push_back(i); } inline void Range(unsigned *v, unsigned N) { for (unsigned i = 0; i < N; ++i) v[i] = i; } void Range(unsigned *v, unsigned n); template void QuickSortInPlaceRecurse(T *Values, int left, int right) { int i = left; int j = right; int Mid = (left + right)/2; T pivot = Values[Mid]; while (i <= j) { if (Desc) { while (Values[i] > pivot) i++; while (Values[j] < pivot) j--; } else { while (Values[i] < pivot) i++; while (Values[j] > pivot) j--; } if (i <= j) { swap(Values[i], Values[j]); i++; j--; } } if (left < j) QuickSortInPlaceRecurse(Values, left, j); if (i < right) QuickSortInPlaceRecurse(Values, i, right); } template void QuickSortOrderRecurse(const T *Values, int left, int right, unsigned *Order) { int i = left; int j = right; int Mid = (left + right)/2; T pivot = Values[Order[Mid]]; while (i <= j) { if (Desc) { while (Values[Order[i]] > pivot) i++; while (Values[Order[j]] < pivot) j--; } else { while (Values[Order[i]] < pivot) i++; while (Values[Order[j]] > pivot) j--; } if (i <= j) { swap(Order[i], Order[j]); i++; j--; } } if (left < j) QuickSortOrderRecurse(Values, left, j, Order); if (i < right) QuickSortOrderRecurse(Values, i, right, Order); } template void QuickSortInPlace(T *Values, unsigned N) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortInPlace); QuickSortInPlaceRecurse(Values, 0, int(N-1)); 
EndTimer(QuickSortInPlace); } template void QuickSortInPlaceDesc(T *Values, unsigned N) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortInPlaceDesc); QuickSortInPlaceRecurse(Values, 0, int(N-1)); EndTimer(QuickSortInPlaceDesc); } template void QuickSortOrder(const T *Values, unsigned N, unsigned *Order) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortOrder); Range(Order, N); QuickSortOrderRecurse(Values, 0, int(N-1), Order); EndTimer(QuickSortOrder); } template void QuickSortOrderDesc(const T *Values, unsigned N, unsigned *Order) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortOrderDesc); Range(Order, N); QuickSortOrderRecurse(Values, 0, int(N-1), Order); EndTimer(QuickSortOrderDesc); } template void QuickSortSubset(const T *Values, unsigned N, unsigned *Subset) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortSubset); QuickSortOrderRecurse(Values, 0, int(N-1), Subset); EndTimer(QuickSortSubset); } template void QuickSortSubsetDesc(const T *Values, unsigned N, unsigned *Subset) { if (N == 0) return; asserta(N < INT_MAX); StartTimer(QuickSortSubsetDesc); QuickSortOrderRecurse(Values, 0, int(N-1), Subset); EndTimer(QuickSortSubsetDesc); } template float GetCountFromMapFloat(map &Map, const t &Key, bool Fail = true) { typename map::const_iterator p = Map.find(Key); if (p == Map.end()) { if (Fail) Die("GetCountFromMapFloat(), key not found"); return 0.0f; } return p->second; } template unsigned GetCountFromMap(map &Map, const t &Key, bool Fail = true) { typename map::const_iterator p = Map.find(Key); if (p == Map.end()) { if (Fail) Die("GetCountFromMap(), key not found"); return 0; } return p->second; } template void IncCountMapFloat(map &Map, const t &Key, float n) { if (Map.find(Key) == Map.end()) Map[Key] = n; else Map[Key] += n; } template void IncCountMap(map &Map, const t &Key, unsigned n = 1) { if (Map.find(Key) == Map.end()) Map[Key] = n; else Map[Key] += n; } template void VecToCountMap(const 
vector &Values, map &Map) { Map.clear(); const unsigned N = SIZE(Values); for (unsigned i = 0; i < N; ++i) { t Value = Values[i]; IncCountMap(Map, Value, 1); } } inline void CountMapToVecs(const map &Map, vector &Keys, vector &Counts) { Keys.clear(); Counts.clear(); vector Keys1; vector Counts1; for (map::const_iterator p = Map.begin(); p != Map.end(); ++p) { Keys1.push_back(p->first); Counts1.push_back(p->second); } const unsigned N = SIZE(Keys1); if (N == 0) return; unsigned *Order = myalloc(unsigned, N); QuickSortOrderDesc(Counts1.data(), N, Order); for (unsigned k = 0; k < N; ++k) { unsigned i = Order[k]; Keys.push_back(Keys1[i]); Counts.push_back(Counts1[i]); } myfree(Order); } //template void CountMapToVecsTpl(const map &Map, // vector &Keys, vector &Counts) // { // Keys.clear(); // Counts.clear(); // vector Keys1; // vector Counts1; // for (map::const_iterator p = Map.begin(); p != Map.end(); ++p) // { // Keys1.push_back(p->first); // Counts1.push_back(p->second); // } // const unsigned N = SIZE(Keys1); // if (N == 0) // return; // unsigned *Order = myalloc(unsigned, N); // QuickSortOrderDesc(Counts1.data(), N, Order); // for (unsigned k = 0; k < N; ++k) // { // unsigned i = Order[k]; // Keys.push_back(Keys1[i]); // Counts.push_back(Counts1[i]); // } // myfree(Order); // } template void QuickSortIndexesRecurse(const T *Values, int left, int right, unsigned *Indexes) { int i = left; int j = right; int Mid = (left + right)/2; T pivot = Values[Indexes[Mid]]; while (i <= j) { if (Desc) { while (Values[Indexes[i]] > pivot) i++; while (Values[Indexes[j]] < pivot) j--; } else { while (Values[Indexes[i]] < pivot) i++; while (Values[Indexes[j]] > pivot) j--; } if (i <= j) { swap(Indexes[i], Indexes[j]); i++; j--; } } if (left < j) QuickSortIndexesRecurse(Values, left, j, Indexes); if (i < right) QuickSortIndexesRecurse(Values, i, right, Indexes); } template void QuickSortIndexesInPlaceDesc(const T *Values, unsigned N, unsigned *Indexes) { if (N == 0) return; 
QuickSortIndexesRecurse(Values, 0, int(N-1), Indexes); } #endif // sort_h muscle-5.1.0/src/stripgappycols.cpp000066400000000000000000000027171424453062600174000ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "msa.h" #include "textfile.h" #include "seq.h" void cmd_strip_gappy_cols() { const string &MSAFileName = opt(strip_gappy_cols); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); MSA Aln; TextFile TF(MSAFileName.c_str()); Aln.FromFASTAFile(TF); TF.Close(); const uint SeqCount = Aln.GetSeqCount(); const uint ColCount = Aln.GetColCount(); ProgressLog("%u seqs, %u cols, max gaps %.4f\n", SeqCount, ColCount, MaxGapFract); vector KeepCols; uint GappyCount = 0; for (uint Col = 0; Col < ColCount; ++Col) { uint GapCount = Aln.GetGapCount(Col); double GapFract = double(GapCount)/double(SeqCount); if (GapFract <= MaxGapFract) KeepCols.push_back(Col); else ++GappyCount; } uint NewColCount = SIZE(KeepCols); ProgressLog("Keeping %u cols (%.1f%%)\n", NewColCount, GetPct(NewColCount, ColCount)); asserta(NewColCount > 0); const string &OutputFileName = opt(output); FILE *f = CreateStdioFile(OutputFileName); byte *NewSeq = myalloc(byte, NewColCount); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { ProgressStep(SeqIndex, SeqCount, "Writing %s", OutputFileName.c_str()); const char *Label = Aln.GetSeqName(SeqIndex); for (uint i = 0; i < NewColCount; ++i) { uint Col = KeepCols[i]; char c = Aln.GetChar(SeqIndex, Col); NewSeq[i] = c; } SeqToFasta(f, NewSeq, NewColCount, Label); } CloseStdioFile(f); } muscle-5.1.0/src/stripgappyrows.cpp000066400000000000000000000024101424453062600174200ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "msa.h" #include "textfile.h" #include "seq.h" void cmd_strip_gappy_rows() { const string &MSAFileName = opt(strip_gappy_rows); double MaxGapFract = 0.5; if (optset_max_gap_fract) MaxGapFract = opt(max_gap_fract); MSA Aln; TextFile 
// Remainder of cmd_strip_gappy_rows (the function opens before this span).
// Writes every sequence whose gap fraction is <= MaxGapFract to the -output
// file and logs how many sequences were discarded.
TF(MSAFileName.c_str());
	Aln.FromFASTAFile(TF);
	TF.Close();

	const uint SeqCount = Aln.GetSeqCount();
	const uint ColCount = Aln.GetColCount();
	ProgressLog("%u seqs, %u cols, max gaps %.4f\n",
	  SeqCount, ColCount, MaxGapFract);

	const string &OutputFileName = opt(output);
	uint DiscardCount = 0;
	FILE *f = CreateStdioFile(OutputFileName);
	for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
		{
		ProgressStep(SeqIndex, SeqCount, "Writing %s", OutputFileName.c_str());
		const char *Label = Aln.GetSeqName(SeqIndex);
		const char *Seq = Aln.m_szSeqs[SeqIndex];
		// Count gap characters in this row.
		uint GapCount = 0;
		for (uint Col = 0; Col < ColCount; ++Col)
			if (isgap(Seq[Col]))
				++GapCount;
		// NOTE(review): if ColCount is 0 this is 0/0; confirm whether an
		// empty alignment can reach this point.
		double GapFract = double(GapCount)/ColCount;
		if (GapFract > MaxGapFract)
			{
			++DiscardCount;
			continue;
			}
		SeqToFasta(f, (const byte *) Seq, ColCount, Label);
		}
	ProgressLog("Discarded %u / %u seqs (%.1f%%)\n",
	  DiscardCount, SeqCount, GetPct(DiscardCount, SeqCount));
	CloseStdioFile(f);
	}

// Archive member header (not C++ source); kept verbatim.
muscle-5.1.0/src/super4.cpp000066400000000000000000000211411424453062600155270ustar00rootroot00000000000000
#include "muscle.h"
#include "super4.h"
#include "sequence.h"
#include "multisequence.h"
#include "usorter.h"
#include "upgma5.h"
#include "pprog.h"
#include "treeperm.h"

// Declarations of helpers defined outside this span.
void LogDistMx(const string &Msg, const vector > &Mx);
void GetConsensusSequence(const MultiSequence &MSA, string &Seq);

// Clears the final MSA, the four guide trees and the four per-permutation MSAs.
void Super4::ClearTreesAndMSAs()
	{
	m_FinalMSA.Clear();

	m_GuideTree_None.Clear();
	m_GuideTree_ABC.Clear();
	m_GuideTree_ACB.Clear();
	m_GuideTree_BCA.Clear();

	m_FinalMSA_None.Clear();
	m_FinalMSA_ABC.Clear();
	m_FinalMSA_ACB.Clear();
	m_FinalMSA_BCA.Clear();
	}

// Builds m_GuideTree_None with UPGMA5 (LINKAGE_Biased) from the cluster
// labels and m_DistMx.
void Super4::MakeGuideTree()
	{
	UPGMA5 U;
	U.Init(m_ClusterLabels, m_DistMx);
	U.FixEADistMx();
	U.Run(LINKAGE_Biased, m_GuideTree_None);
	}

// Splits InputMFA into consecutive chunks of at most MaxSize sequences, in
// input order, appending one newly allocated MultiSequence per chunk to
// SplitMFAs. Requires InputMFA to hold more than MaxSize sequences.
// (The loop body continues past this span.)
void Super4::SplitBigMFA_Random(MultiSequence &InputMFA, uint MaxSize,
  vector &SplitMFAs)
	{
	SplitMFAs.clear();
	const uint InputSeqCount = InputMFA.GetSeqCount();
	asserta(InputSeqCount > MaxSize);
	uint OutputSeqCount = 0;
	for (;;)
		{
		asserta(OutputSeqCount <= InputSeqCount);
uint RemainingSeqCount = InputSeqCount - OutputSeqCount; if (RemainingSeqCount == 0) break; uint N = RemainingSeqCount; if (N > MaxSize) N = MaxSize; MultiSequence *SplitMFA = new MultiSequence; asserta(SplitMFA != 0); for (uint i = 0; i < N; ++i) { const Sequence *seq = InputMFA.GetSequence(OutputSeqCount + i); SplitMFA->AddSequence(seq, false); } SplitMFAs.push_back(SplitMFA); OutputSeqCount += N; } AssertSameSeqsVec(InputMFA, SplitMFAs); } void Super4::SplitBigMFA(MultiSequence &BigMFA, uint MaxSize, float MinEA, vector &SplitMFAs) { SplitMFAs.clear(); const uint InputSeqCount = BigMFA.GetSeqCount(); asserta(InputSeqCount > MaxSize); m_EC.Run(BigMFA, MinEA); m_EC.GetClusterMFAs(SplitMFAs); uint ClusterCount = SIZE(SplitMFAs); asserta(ClusterCount > 0); AssertSameSeqsVec(BigMFA, SplitMFAs); for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex) { MultiSequence *MFA = SplitMFAs[ClusterIndex]; uint SeqCount = MFA->GetSeqCount(); if (SeqCount > MaxSize) { vector *SubMFAs = new vector; SplitBigMFA_Random(*MFA, MaxSize, *SubMFAs); AssertSameSeqsVec(*MFA, *SubMFAs); const uint N = SIZE(*SubMFAs); asserta(N > 1); SplitMFAs[ClusterIndex] = (*SubMFAs)[0]; for (uint i = 1; i < N; ++i) { MultiSequence *SubMFA = (*SubMFAs)[i]; SplitMFAs.push_back(SubMFA); } } } AssertSameSeqsVec(BigMFA, SplitMFAs); } void Super4::ClusterInput() { const uint InputSeqCount = m_InputSeqs->GetSeqCount(); m_EC.Run(*m_InputSeqs, m_MinEAPass1); m_EC.GetClusterMFAs(m_ClusterMFAs); AssertSameSeqsVec(*m_InputSeqs, m_ClusterMFAs); uint ClusterCount = SIZE(m_ClusterMFAs); ProgressLog("%u clusters pass 1\n", ClusterCount); for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex) { MultiSequence *MFA = m_ClusterMFAs[ClusterIndex]; AssertSameLabels(*MFA); uint SeqCount = MFA->GetSeqCount(); asserta(SeqCount > 0); if (SeqCount > m_MaxClusterSize) { vector *SplitMFAs = new vector; SplitBigMFA(*MFA, m_MaxClusterSize, m_MinEAPass2, *SplitMFAs); const uint N = SIZE(*SplitMFAs); 
// Remainder of Super4::ClusterInput (the function opens before this span).
			asserta(N > 1);
			// The first piece replaces the oversized cluster; the rest are appended.
			m_ClusterMFAs[ClusterIndex] = (*SplitMFAs)[0];
			for (uint i = 1; i < N; ++i)
				{
				MultiSequence *SplitMFA = (*SplitMFAs)[i];
				m_ClusterMFAs.push_back(SplitMFA);
				AssertSameLabels(*SplitMFA);
				}
			}
		}

	ClusterCount = SIZE(m_ClusterMFAs);
	ProgressLog("%u clusters pass 2\n", ClusterCount);
	AssertSameSeqsVec(*m_InputSeqs, m_ClusterMFAs);

	// Label cluster i as "Cluster<i>" and check the size limit now holds.
	m_ClusterLabels.clear();
	for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex)
		{
		MultiSequence *MFA = m_ClusterMFAs[ClusterIndex];
		uint SeqCount = MFA->GetSeqCount();
		asserta(SeqCount <= m_MaxClusterSize);
		string ClusterLabel;
		Ps(ClusterLabel, "Cluster%u", ClusterIndex);
		m_ClusterLabels.push_back(ClusterLabel);
		}
	}

// Aligns each cluster in m_ClusterMFAs with m_MPC (tree permutation forced
// to TP_None) and stores a newly allocated copy of each resulting MSA in
// m_ClusterMSAs, in cluster order.
void Super4::AlignClusters()
	{
	m_ClusterMSAs.clear();
	uint ClusterCount = SIZE(m_ClusterMFAs);
	for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex)
		{
		MultiSequence *ClusterMFA = m_ClusterMFAs[ClusterIndex];
		AssertSameLabels(*ClusterMFA);
		const uint SeqCount = ClusterMFA->GetSeqCount();
		if (SeqCount == 1)
			Progress("Align cluster %u / %u (1 seq)\n",
			  ClusterIndex + 1, ClusterCount);
		else
			{
			Progress("\n");
			Progress("Align cluster %u / %u (%u seqs)\n",
			  ClusterIndex + 1, ClusterCount, SeqCount);
			Progress("\n");
			}
		m_MPC.m_TreePerm = TP_None;
		m_MPC.Run(ClusterMFA);
		// Copy the result out of m_MPC before the next Run() is issued.
		MultiSequence *ClusterMSA = new MultiSequence;
		asserta(ClusterMSA != 0);
		ClusterMSA->Copy(*m_MPC.m_MSA);
		AssertSameLabels(*ClusterMSA);
		m_ClusterMSAs.push_back(ClusterMSA);
		}
	}

// Clears the contents of every cluster MSA and empties m_ClusterMSAs.
// NOTE(review): the MultiSequence objects themselves were created with `new`
// in AlignClusters and are only Clear()ed here, never deleted; confirm
// whether something else owns them.
void Super4::DeleteClusterMSAs()
	{
	const uint N = SIZE(m_ClusterMSAs);
	for (uint i = 0; i < N; ++i)
		{
		MultiSequence *MSA = m_ClusterMSAs[i];
		if (MSA != 0)
			MSA->Clear();
		}
	m_ClusterMSAs.clear();
	}

// Head of Super4::GetConsensusSeqs (continues past this span).
// Builds one consensus sequence per cluster MSA, labelled "Cluster<i>".
void Super4::GetConsensusSeqs()
	{
	m_ConsensusSeqs.Clear();
	uint ClusterCount = SIZE(m_ClusterMSAs);
	for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex)
		{
		ProgressStep(ClusterIndex, ClusterCount, "Consensus sequences");
		MultiSequence *ClusterMSA = m_ClusterMSAs[ClusterIndex];
		string Label;
		Ps(Label, "Cluster%u", ClusterIndex);
		string ConsSeq;
// Remainder of Super4::GetConsensusSeqs (the function opens before this span).
		GetConsensusSequence(*ClusterMSA, ConsSeq);
		Sequence *seq = NewSequence();
		seq->FromString(Label, ConsSeq);
		m_ConsensusSeqs.AddSequence(seq, true);
		asserta(ClusterIndex < SIZE(m_ClusterLabels));
		// NOTE(review): ClusterLabel is bound but never used afterwards.
		const string &ClusterLabel = m_ClusterLabels[ClusterIndex];
		}

	// Optional: when -calnout is set, align the consensus sequences with
	// m_MPC and write that alignment to the named file.
	if (optset_calnout)
		{
		m_MPC.Run(&m_ConsensusSeqs);
		m_MPC.m_MSA->WriteMFA(opt(calnout));
		}
	}

// Hands the cluster MSAs and their labels to m_PP and sets its target pair
// count from m_PairCount.
void Super4::InitPP()
	{
	vector MSAs;
	const uint n = SIZE(m_ClusterMSAs);
	for (uint i = 0; i < n; ++i)
		{
		const MultiSequence *MSA = m_ClusterMSAs[i];
		MSAs.push_back(MSA);
		}
	m_PP.m_TargetPairCount = m_PairCount;
	m_PP.SetMSAs(MSAs, m_ClusterLabels);
	}

// Loads tunables from command-line options, falling back to the defaults
// given as the second argument of each optd().
void Super4::SetOpts()
	{
	m_PairCount = optd(paircount, DEFAULT_TARGET_PAIR_COUNT);
	// NOTE(review): this reads the same `paircount` option as the line above,
	// so setting -paircount also changes the maximum cluster size. It looks
	// like a copy-paste slip; confirm which option was intended.
	m_MaxClusterSize = optd(paircount, DEFAULT_MAX_COARSE_SEQS);
	m_MinEAPass1 = (float) optd(super4_minea1, DEFAULT_MIN_EA_SUPER4_PASS1);
	m_MinEAPass2 = (float) optd(super4_minea2, DEFAULT_MIN_EA_SUPER4_PASS2);
	m_MPC.m_ConsistencyIterCount = optd(consiters, 2);
	m_MPC.m_RefineIterCount = optd(refineiters, 100);
	}

// Fills m_DistMx from the consensus sequences via CalcEADistMx, passing a
// null FILE pointer as its first argument.
void Super4::CalcConsensusSeqsDistMx()
	{
	FILE *f = 0;
	CalcEADistMx(f, &m_ConsensusSeqs, m_DistMx);
	}

// Runs the coarse stages in order: cluster, align clusters, consensus
// sequences, distance matrix, guide tree, then PProg initialisation.
void Super4::CoarseAlign()
	{
	AssertSameLabels(*m_InputSeqs);
	ClusterInput();
	AlignClusters();
	GetConsensusSeqs();
	CalcConsensusSeqsDistMx();
	MakeGuideTree();
	InitPP();
	}

// Head of Super4::Run (the switch continues past this span).
// With TP_None the default guide tree is used and the result is copied to
// m_FinalMSA; otherwise the guide tree is permuted and the requested
// variant(s) are aligned.
void Super4::Run(MultiSequence &InputSeqs, TREEPERM TreePerm)
	{
	m_InputSeqs = &InputSeqs;
	SetOpts();
	CoarseAlign();
	if (TreePerm == TP_None)
		{
		m_PP.RunGuideTree(m_GuideTree_None);
		const MultiSequence &FinalMSA = m_PP.GetFinalMSA();
		m_FinalMSA.Copy(FinalMSA);
		DeleteClusterMSAs();
		return;
		}

	vector LabelsA;
	vector LabelsB;
	vector LabelsC;
	PermuteTree(m_GuideTree_None, m_GuideTree_ABC, m_GuideTree_ACB, m_GuideTree_BCA,
	  LabelsA, LabelsB, LabelsC);

	switch (TreePerm)
		{
	case TP_ABC:
		ProgressLog("Guide tree ABC\n");
		m_PP.RunGuideTree(m_GuideTree_ABC);
		m_FinalMSA.Copy(m_PP.GetFinalMSA());
		break;

	case TP_ACB:
		ProgressLog("Guide tree ACB\n");
		m_PP.RunGuideTree(m_GuideTree_ACB);
		m_FinalMSA.Copy(m_PP.GetFinalMSA());
		break;

	case TP_BCA:
ProgressLog("Guide tree BCA\n"); m_PP.RunGuideTree(m_GuideTree_BCA); m_FinalMSA.Copy(m_PP.GetFinalMSA()); break; case TP_All: ProgressLog("Guide tree (default)\n"); m_PP.RunGuideTree(m_GuideTree_None); m_FinalMSA_None.Copy(m_PP.GetFinalMSA()); ProgressLog("Guide tree ABC\n"); m_PP.RunGuideTree(m_GuideTree_ABC); m_FinalMSA_ABC.Copy(m_PP.GetFinalMSA()); ProgressLog("Guide tree ACB\n"); m_PP.RunGuideTree(m_GuideTree_ACB); m_FinalMSA_ACB.Copy(m_PP.GetFinalMSA()); ProgressLog("Guide tree BCA\n"); m_PP.RunGuideTree(m_GuideTree_BCA); m_FinalMSA_BCA.Copy(m_PP.GetFinalMSA()); break; default: asserta(false); } DeleteClusterMSAs(); } void cmd_super4() { const string &InputFileName = opt(super4); const string &OutputFileName = opt(output); MultiSequence &InputSeqs = LoadGlobalInputMS(InputFileName); bool Nucleo = false; if (opt(nt)) Nucleo = true; else if (opt(amino)) Nucleo = false; else Nucleo = InputSeqs.GuessIsNucleo(); TREEPERM TP = TP_None; if (optset_perm) TP = StrToTREEPERM(opt(perm)); SetAlpha(Nucleo ? 
ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); Super4 S4; S4.Run(InputSeqs, TP); S4.m_FinalMSA.WriteMFA(OutputFileName); } muscle-5.1.0/src/super4.h000066400000000000000000000032671424453062600152050ustar00rootroot00000000000000#pragma once #include "tree.h" #include "pprog.h" #include "eacluster.h" #include "treeperm.h" #include "mpcflat.h" static const float DEFAULT_MIN_EA_SUPER4_PASS1 = 0.7f; static const float DEFAULT_MIN_EA_SUPER4_PASS2 = 0.9f; class Super4 { public: uint m_PairCount = DEFAULT_TARGET_PAIR_COUNT; uint m_MaxClusterSize = DEFAULT_MAX_COARSE_SEQS; float m_MinEAPass1 = DEFAULT_MIN_EA_SUPER4_PASS1; float m_MinEAPass2 = DEFAULT_MIN_EA_SUPER4_PASS2; MultiSequence *m_InputSeqs = 0; // Pass 1: EA clusters EACluster m_EC; vector m_ClusterMFAs; vector m_ClusterLabels; // Pass 2: Probcons MSA for each EA cluster MPCFlat m_MPC; vector m_ClusterMSAs; // Pass 3: Consensus sequence for each MSA MultiSequence m_ConsensusSeqs; // Pass 4: Distance matrix on consensus sequences vector > m_DistMx; // Pass 5: PProg PProg m_PP; MultiSequence m_FinalMSA; Tree m_GuideTree_None; Tree m_GuideTree_ABC; Tree m_GuideTree_ACB; Tree m_GuideTree_BCA; MultiSequence m_FinalMSA_None; MultiSequence m_FinalMSA_ABC; MultiSequence m_FinalMSA_ACB; MultiSequence m_FinalMSA_BCA; public: void SetOpts(); void ClearTreesAndMSAs(); void CoarseAlign(); void InitPP(); void Run(MultiSequence &InputSeqs, TREEPERM TreePerm); uint GetInputSeqCount() const { return m_InputSeqs->GetSeqCount(); } void ClusterInput(); void SplitBigMFA(MultiSequence &MFA, uint MaxSize, float MinEA, vector &SplitMFAs); void SplitBigMFA_Random(MultiSequence &MFA, uint MaxSize, vector &SplitMFAs); void AlignClusters(); void GetConsensusSeqs(); void CalcConsensusSeqsDistMx(); void MakeGuideTree(); void DeleteClusterMSAs(); }; muscle-5.1.0/src/super5.cpp000066400000000000000000000265461424453062600155460ustar00rootroot00000000000000#include "muscle.h" #include "super4.h" #include "sequence.h" #include "multisequence.h" 
#include "usorter.h"
#include "upgma5.h"
#include "pprog.h"
#include "derep.h"
#include "uclust.h"
#include "super5.h"

// Rebuilds Str as the concatenation of the elements of Vec, in order.
void CharVecToStr(const vector &Vec, string &Str)
	{
	Str.clear();
	for (uint i = 0; i < SIZE(Vec); ++i)
		Str += Vec[i];
	}

// Loads the pass-1 EA threshold from -super5_minea1, or its default.
void Super5::SetOpts()
	{
	m_MinEAPass1 = (float) optd(super5_minea1, DEFAULT_MIN_EA_SUPER5_PASS1);
	}

// Clears the four guide trees and the four per-permutation final MSAs.
void Super5::ClearTreesAndMSAs()
	{
	m_GuideTree_None.Clear();
	m_GuideTree_ABC.Clear();
	m_GuideTree_ACB.Clear();
	m_GuideTree_BCA.Clear();

	m_FinalMSA_None.Clear();
	m_FinalMSA_ABC.Clear();
	m_FinalMSA_ACB.Clear();
	m_FinalMSA_BCA.Clear();
	}

// Dereplicates InputSeqs with m_D, clusters the unique sequences with m_U at
// m_MinEAPass1, stores the centroids in m_CentroidSeqs, and fills and
// validates the dupe/centroid/member bookkeeping vectors.
void Super5::MakeCentroidSeqs(MultiSequence &InputSeqs)
	{
	m_InputSeqs = &InputSeqs;

	m_UniqueSeqs = new MultiSequence;
	m_CentroidSeqs = new MultiSequence;
	// NOTE(review): Run() and AlignCentroidSeqs() both overwrite
	// m_CentroidMSA with the address of an m_S4 member, so this allocation
	// appears to be orphaned; confirm no other caller relies on it.
	m_CentroidMSA = new MultiSequence;

	m_D.Run(*m_InputSeqs);
	m_D.Validate();
	m_D.GetUniqueSeqs(*m_UniqueSeqs);
	SetDupeVecs();

	m_U.Run(*m_UniqueSeqs, m_MinEAPass1);
	m_U.GetCentroidSeqs(*m_CentroidSeqs);
	SetCentroidVecs();
	SetCentroidSeqsVecs();
	ValidateVecs();
	}

// Full Super5 pipeline: build the centroids, align them with m_S4, then
// extend each centroid MSA with cluster members and duplicates.
// For any Perm other than TP_All the result is exposed through the
// m_FinalMSA pointer, which aliases m_ExtendedMSA. For TP_All the four
// variants are copied into m_FinalMSA_None/_ABC/_ACB/_BCA.
void Super5::Run(MultiSequence &InputSeqs, TREEPERM Perm)
	{
	MakeCentroidSeqs(InputSeqs);
	m_S4.Run(*m_CentroidSeqs, Perm);

	if (Perm != TP_All)
		{
		m_CentroidMSA = &m_S4.m_FinalMSA;
		SetCentroidMSAVecs();
		AlignMembers();
		AlignDupes();
		m_FinalMSA = m_ExtendedMSA;
		return;
		}

	// TP_All: repeat the extension step once per guide-tree variant.
	m_CentroidMSA = &m_S4.m_FinalMSA_None;
	SetCentroidMSAVecs();
	AlignMembers();
	AlignDupes();
	m_FinalMSA_None.Copy(*m_ExtendedMSA);

	m_CentroidMSA = &m_S4.m_FinalMSA_ABC;
	SetCentroidMSAVecs();
	AlignMembers();
	AlignDupes();
	m_FinalMSA_ABC.Copy(*m_ExtendedMSA);

	m_CentroidMSA = &m_S4.m_FinalMSA_ACB;
	SetCentroidMSAVecs();
	AlignMembers();
	AlignDupes();
	m_FinalMSA_ACB.Copy(*m_ExtendedMSA);

	m_CentroidMSA = &m_S4.m_FinalMSA_BCA;
	SetCentroidMSAVecs();
	AlignMembers();
	AlignDupes();
	m_FinalMSA_BCA.Copy(*m_ExtendedMSA);
	}

// Head of Super5::AlignCentroidSeqs (continues past this span).
// Aligns the existing centroids with permutation Perm and extends the result
// with members and duplicates.
void Super5::AlignCentroidSeqs(TREEPERM Perm, MultiSequence &MSA)
	{
	m_S4.Run(*m_CentroidSeqs, Perm);
	m_CentroidMSA = &m_S4.m_FinalMSA;
	SetCentroidMSAVecs();
	AlignMembers();
	AlignDupes();
	asserta(m_ExtendedMSA != 0);
MSA.Copy(*m_ExtendedMSA); } void Super5::SetCentroidMSAVecs() { m_CentroidMSASeqIndexToGSI.clear(); m_GSIToCentroidMSASeqIndex.clear(); const uint CentroidSeqCount = SIZE(m_CentroidGSIs); const uint CentroidMSASeqCount = m_CentroidMSA->GetSeqCount(); asserta(CentroidSeqCount == CentroidMSASeqCount); const uint GlobalSeqCount = GetGlobalMSSeqCount(); m_GSIToCentroidMSASeqIndex.resize(GlobalSeqCount, UINT_MAX); m_CentroidMSASeqIndexToGSI.clear(); for (uint CentroidMSASeqIndex = 0; CentroidMSASeqIndex < CentroidSeqCount; ++CentroidMSASeqIndex) { const Sequence *Seq = m_CentroidMSA->GetSequence(CentroidMSASeqIndex); uint GSI = Seq->GetGSI(); asserta(GSI < GlobalSeqCount); if (m_GSIToCentroidMSASeqIndex[GSI] != UINT_MAX) Die("Super5::SetCentroidMSAVecs() GSI=%u found twice (%u,%u)", GSI, m_GSIToCentroidMSASeqIndex[GSI], CentroidMSASeqIndex); m_GSIToCentroidMSASeqIndex[GSI] = CentroidMSASeqIndex; m_CentroidMSASeqIndexToGSI.push_back(GSI); } } void Super5::SetCentroidSeqsVecs() { m_CentroidSeqsSeqIndexToGSI.clear(); m_GSIToCentroidSeqsSeqIndex.clear(); const uint CentroidSeqCount = SIZE(m_CentroidGSIs); const uint CentroidSeqSeqCount = m_CentroidSeqs->GetSeqCount(); asserta(CentroidSeqCount == CentroidSeqSeqCount); const uint GlobalSeqCount = GetGlobalMSSeqCount(); m_GSIToCentroidSeqsSeqIndex.resize(GlobalSeqCount, UINT_MAX); m_CentroidSeqsSeqIndexToGSI.clear(); for (uint CentroidSeqSeqIndex = 0; CentroidSeqSeqIndex < CentroidSeqCount; ++CentroidSeqSeqIndex) { const Sequence *Seq = m_CentroidSeqs->GetSequence(CentroidSeqSeqIndex); uint GSI = Seq->GetGSI(); asserta(GSI < GlobalSeqCount); asserta(m_GSIToCentroidSeqsSeqIndex[GSI] == UINT_MAX); m_GSIToCentroidSeqsSeqIndex[GSI] = CentroidSeqSeqIndex; m_CentroidSeqsSeqIndexToGSI.push_back(GSI); } } void Super5::SetDupeVecs() { const uint InputSeqCount = m_InputSeqs->GetSeqCount(); m_DupeGSIs.clear(); m_DupeRepGSIs.clear(); m_IsDupe.clear(); m_D.GetDupeGSIs( m_DupeGSIs, m_DupeRepGSIs); m_IsDupe.resize(InputSeqCount, false); 
const uint DupeCount = SIZE(m_DupeGSIs); for (uint i = 0; i < DupeCount; ++i) { uint GSI = m_DupeGSIs[i]; asserta(GSI < InputSeqCount); asserta(m_IsDupe[GSI] == false); m_IsDupe[GSI] = true; } } void Super5::SetCentroidVecs() { const uint InputSeqCount = m_InputSeqs->GetSeqCount(); m_CentroidGSIs.clear(); m_MemberGSIs.clear(); m_MemberCentroidGSIs.clear(); m_U.GetGSIs(m_CentroidGSIs, m_MemberGSIs, m_MemberCentroidGSIs, m_GSIToMemberCentroidPath); m_IsCentroid.clear(); m_IsMember.clear(); m_IsCentroid.resize(InputSeqCount, false); m_IsMember.resize(InputSeqCount, false); const uint GSICount = GetGlobalMSSeqCount(); m_GSIToMemberCount.resize(GSICount, 0); const uint CentroidCount = SIZE(m_CentroidGSIs); for (uint i = 0; i < CentroidCount; ++i) { uint CentroidGSI = m_CentroidGSIs[i]; asserta(CentroidGSI < InputSeqCount); asserta(!m_IsDupe[CentroidGSI]); asserta(!m_IsCentroid[CentroidGSI]); m_IsCentroid[CentroidGSI] = true; } const uint MemberCount = SIZE(m_MemberGSIs); asserta(SIZE(m_MemberCentroidGSIs) == MemberCount); for (uint i = 0; i < MemberCount; ++i) { uint MemberGSI = m_MemberGSIs[i]; uint MemberCentroidGSI = m_MemberCentroidGSIs[i]; asserta(MemberGSI < GSICount); asserta(MemberCentroidGSI < GSICount); asserta(m_IsCentroid[MemberCentroidGSI]); bool IsDupe = m_IsDupe[MemberGSI]; bool IsMember = m_IsMember[MemberGSI]; bool IsCentroid = m_IsCentroid[MemberGSI]; if (IsDupe || IsMember || IsCentroid) Die("Super5::SetCentroidVecs(), MemberGSI=%u dupe=%c mem=%c cent=%c", MemberGSI, tof(IsDupe), tof(IsMember), tof(IsCentroid)); asserta(!IsDupe); asserta(!IsMember); asserta(!IsCentroid); m_IsMember[MemberGSI] = true; m_GSIToMemberCount[MemberCentroidGSI] += 1; } } void Super5::ValidateVecs() const { const uint InputSeqCount = m_InputSeqs->GetSeqCount(); asserta(SIZE(m_IsDupe) == InputSeqCount); asserta(SIZE(m_IsCentroid) == InputSeqCount); asserta(SIZE(m_IsMember) == InputSeqCount); for (uint i = 0; i < InputSeqCount; ++i) { bool Dupe = m_IsDupe[i]; bool Centroid = 
m_IsCentroid[i]; bool Member = m_IsMember[i]; int Sum = int(Dupe) + int(Centroid) + int(Member); if (Sum != 1) Die("Input seq %u dupe %c, centroid %c member %c", i, tof(Dupe), tof(Centroid), tof(Member)); } } void Super5::AlignMembers() { const uint MemberCount = SIZE(m_MemberGSIs); const uint GSICount = GetGSICount(); asserta(SIZE(m_MemberCentroidGSIs) == MemberCount); asserta(m_CentroidMSA != 0); const uint CentroidCount = m_CentroidMSA->GetSeqCount(); const uint CentroidMSAColCount = m_CentroidMSA->GetColCount(); vector MemberIndexToCentroidIndex; asserta(SIZE(m_MemberCentroidGSIs) == MemberCount); asserta(SIZE(m_GSIToCentroidMSASeqIndex) == GSICount); asserta(SIZE(m_GSIToMemberCentroidPath) == GSICount); MultiSequence *MemberSeqs = new MultiSequence; asserta(MemberSeqs != 0); vector MemberPaths; for (uint MemberIndex = 0; MemberIndex < MemberCount; ++MemberIndex) { uint MemberGSI = m_MemberGSIs[MemberIndex]; Sequence *MemberSeq = (Sequence *) &GetGlobalInputSeq(MemberGSI); MemberSeqs->AddSequence(MemberSeq, false); uint CentroidGSI = m_MemberCentroidGSIs[MemberIndex]; asserta(CentroidGSI < GSICount); uint CentroidMSASeqIndex = m_GSIToCentroidMSASeqIndex[CentroidGSI]; asserta(CentroidMSASeqIndex < CentroidCount); MemberIndexToCentroidIndex.push_back(CentroidMSASeqIndex); const string &Path = m_GSIToMemberCentroidPath[MemberGSI]; asserta(!Path.empty()); MemberPaths.push_back(Path); } m_TA.Init(*m_CentroidMSA, *MemberSeqs, MemberIndexToCentroidIndex, MemberPaths); m_TA.MakeExtendedMSA(); asserta(m_TA.m_ExtendedMSA != 0); m_ExtendedMSA = m_TA.m_ExtendedMSA; AssertSeqsEqInput(*m_ExtendedMSA); } void Super5::AlignDupes() { const uint DupeCount = SIZE(m_DupeGSIs); asserta(SIZE(m_DupeRepGSIs) == DupeCount); if (DupeCount == 0) return; ProgressLog("Inserting %u dupes...", DupeCount); const uint GSICount = GetGSICount(); vector GSIToExtendedSeqIndex(GSICount, UINT_MAX); asserta(m_ExtendedMSA != 0); const uint ExtendedSeqCount = m_ExtendedMSA->GetSeqCount(); for (uint 
ExtendedSeqIndex = 0; ExtendedSeqIndex < ExtendedSeqCount; ++ExtendedSeqIndex) { const Sequence *Seq = m_ExtendedMSA->GetSequence(ExtendedSeqIndex); uint GSI = Seq->GetGSI(); asserta(GSI < GSICount); asserta(GSIToExtendedSeqIndex[GSI] == UINT_MAX); GSIToExtendedSeqIndex[GSI] = ExtendedSeqIndex; } for (uint i = 0; i < DupeCount; ++i) { uint DupeGSI = m_DupeGSIs[i]; uint RepGSI = m_DupeRepGSIs[i]; asserta(RepGSI < GSICount); uint RepExtendedSeqIndex = GSIToExtendedSeqIndex[RepGSI]; asserta(RepExtendedSeqIndex < ExtendedSeqCount); const Sequence *Rep = m_ExtendedMSA->GetSequence(RepExtendedSeqIndex); Sequence *AlignedDupe = Rep->Clone(); AlignedDupe->OverwriteGSI(DupeGSI); const string &Label = GetGlobalInputSeqLabel(DupeGSI); AlignedDupe->OverwriteLabel(Label); m_ExtendedMSA->AddSequence(AlignedDupe, true); } ProgressLog(" done.\n"); AssertSeqsEqInput(*m_ExtendedMSA); } void cmd_super5() { LoadGlobalInputMS(opt(super5)); string &OutputPattern = opt(output); if (OutputPattern.empty()) Die("Must set -output"); ShowGlobalInputSeqStats(); MultiSequence &InputSeqs = GetGlobalInputMS(); const uint InputSeqCount = GetGlobalMSSeqCount(); bool Nucleo = false; if (opt(nt)) Nucleo = true; else if (opt(amino)) Nucleo = false; else Nucleo = InputSeqs.GuessIsNucleo(); SetAlpha(Nucleo ? 
ALPHA_Nucleo : ALPHA_Amino); InitProbcons(); if (optset_diversified) Die("-diversified not supported"); if (optset_replicates) Die("-replicates not supported"); if (optset_stratified) Die("-stratified not supported"); TREEPERM Perm = TP_None; if (optset_perm) Perm = StrToTREEPERM(opt(perm)); if (Perm == TP_All && OutputPattern.find('@') == string::npos) Die("Must be '@' in output filename with -perm all"); Super5 S5; S5.SetOpts(); S5.Run(InputSeqs, Perm); if (Perm == TP_All) { string FileName_None; string FileName_ABC; string FileName_ACB; string FileName_BCA; uint PerturbSeed = 0; if (optset_perturb) PerturbSeed = opt(perturb); MakeReplicateFileName(OutputPattern, TP_None, PerturbSeed, FileName_None); MakeReplicateFileName(OutputPattern, TP_ABC, PerturbSeed, FileName_ABC); MakeReplicateFileName(OutputPattern, TP_ACB, PerturbSeed, FileName_ACB); MakeReplicateFileName(OutputPattern, TP_BCA, PerturbSeed, FileName_BCA); S5.m_FinalMSA_None.WriteMFA(FileName_None); S5.m_FinalMSA_ABC.WriteMFA(FileName_ABC); S5.m_FinalMSA_ACB.WriteMFA(FileName_ACB); S5.m_FinalMSA_BCA.WriteMFA(FileName_BCA); } else { uint PerturbSeed = 0; if (optset_perturb) PerturbSeed = opt(perturb); string OutputFileName; if (OutputPattern.find('@') == string::npos) OutputFileName = OutputPattern; else MakeReplicateFileName(OutputPattern, Perm, PerturbSeed, OutputFileName); S5.m_FinalMSA->WriteMFA(OutputFileName); } if (S5.m_FinalMSA != 0) { S5.m_FinalMSA->Clear(); S5.m_FinalMSA = 0; } S5.m_FinalMSA_None.Clear(); S5.m_FinalMSA_ABC.Clear(); S5.m_FinalMSA_ACB.Clear(); S5.m_FinalMSA_BCA.Clear(); ClearGlobalInputMS(); #if SEQ_TRACE Sequence::AllocReport("final"); #endif } muscle-5.1.0/src/super5.h000066400000000000000000000030551424453062600152010ustar00rootroot00000000000000#pragma once #include "derep.h" #include "uclust.h" #include "transaln.h" #include "super4.h" static const float DEFAULT_MIN_EA_SUPER5_PASS1 = 0.99f; class Super5 { public: float m_MinEAPass1 = DEFAULT_MIN_EA_SUPER5_PASS1; MultiSequence 
*m_InputSeqs = 0; MultiSequence *m_UniqueSeqs = 0; MultiSequence *m_CentroidSeqs = 0; MultiSequence *m_CentroidMSA = 0; MultiSequence *m_ExtendedMSA = 0; MultiSequence *m_FinalMSA = 0; Tree m_GuideTree_None; Tree m_GuideTree_ABC; Tree m_GuideTree_ACB; Tree m_GuideTree_BCA; MultiSequence m_FinalMSA_None; MultiSequence m_FinalMSA_ABC; MultiSequence m_FinalMSA_ACB; MultiSequence m_FinalMSA_BCA; Derep m_D; UClust m_U; TransAln m_TA; Super4 m_S4; vector m_IsDupe; vector m_IsCentroid; vector m_IsMember; vector m_DupeGSIs; vector m_DupeRepGSIs; vector m_CentroidGSIs; vector m_MemberGSIs; vector m_MemberCentroidGSIs; vector m_CentroidSeqsSeqIndexToGSI; vector m_CentroidMSASeqIndexToGSI; vector m_GSIToCentroidSeqsSeqIndex; vector m_GSIToCentroidMSASeqIndex; vector m_GSIToMemberCount; vector m_GSIToMemberCentroidPath; public: void SetOpts(); void Run(MultiSequence &InputSeqs, TREEPERM Perm); void MakeCentroidSeqs(MultiSequence &InputSeqs); void AlignCentroidSeqs(TREEPERM Perm, MultiSequence &MSA); void SetDupeVecs(); void SetCentroidVecs(); void SetCentroidSeqsVecs(); void SetCentroidMSAVecs(); void AlignMembers(); void AlignDupes(); void ValidateVecs() const; void ClearTreesAndMSAs(); }; muscle-5.1.0/src/testfb.cpp000066400000000000000000000236711424453062600156060ustar00rootroot00000000000000#include "muscle.h" #include "timing.h" #include "mysparsemx.h" #if 0 void CalcFwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); void CalcBwdFlat(const byte *X, uint LX, const byte *Y, uint LY, float *Flat); void CalcFwdSimple(const string &X, const string &Y, vector > > &Fwd); void CalcBwdSimple(const string &X, const string &Y, vector > > &Bwd); float CalcTotalProbFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY); void CalcPostFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY, float *Post); float CalcAlnFlat(const float *Post, uint LX, uint LY, float *DPRows, char *TB, string &Path); static uint g_PathEqCount; static uint 
g_PathDiffCount; static void CalcBwdFlat(const string &X, const string &Y, float *Flat) { const byte *pX = (const byte *) X.c_str(); const byte *pY = (const byte *) Y.c_str(); uint LX = SIZE(X); uint LY = SIZE(Y); CalcBwdFlat(pX, LX, pY, LY, Flat); } static void CalcFwdFlat(const string &X, const string &Y, float *Flat) { const byte *pX = (const byte *) X.c_str(); const byte *pY = (const byte *) Y.c_str(); uint LX = SIZE(X); uint LY = SIZE(Y); CalcFwdFlat(pX, LX, pY, LY, Flat); } static void LogTBA(const vector > &A, const vector > &TB, uint LX, uint LY) { Log("\n"); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %8u", j); Log("\n"); for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) Log(" %8.3g", A[i][j]); Log("\n"); } Log("\n"); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %2u", j); Log("\n"); for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) { char c = TB[i][j]; if (c == 'D') c = 'B'; else if (c == 'L') c = 'Y'; else if (c == 'U') c = 'X'; Log(" %2c", c); } Log("\n"); } } static void LogTomPost(const vector &TomPosterior, uint LX, uint LY) { Log("\n"); Log("TomPost LX=%u LY=%u\n", LX, LY); Log(" "); for (uint j = 0; j <= LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = 0; for (uint i = 0; i <= LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j <= LY; ++j) { float P = TomPosterior[Ix++]; Log(" %10.3g", P); } Log("\n"); } } static void LogFlatPost(const float *MyPost, uint LX, uint LY) { Log("\n"); Log("MyPost LX=%u LY=%u\n", LX, LY); Log(" "); for (uint j = 0; j < LY; ++j) Log(" %10u", j); Log("\n"); uint Ix = 0; for (uint i = 0; i < LX; ++i) { Log("[%3u] ", i); for (uint j = 0; j < LY; ++j) { float P = MyPost[Ix++]; Log(" %10.3g", P); } Log("\n"); } } static bool PostEq(float x, float y) { if (x == y) return true; if (abs(x) < POSTERIOR_CUTOFF && abs(x) < POSTERIOR_CUTOFF) return true; double X = fabs(x); double Y = fabs(y); double Max = max(X, Y); double Diff = fabs(X-Y); bool Same = Diff < Max*0.05; 
if (Same) return true; return false; } static void CmpPost(const vector &TomPosterior, const float *PostFlat, uint LX, uint LY) { for (uint i = 0; i < LX; ++i) { for (uint j = 0; j < LY; ++j) { float TomP = TomPosterior[(LY+1)*(i+1) + j + 1]; float MyP = PostFlat[LY*i + j]; if (!PostEq(TomP, MyP)) { LogTomPost(TomPosterior, LX, LY); LogFlatPost(PostFlat, LX, LY); Die("CmpPost i=%u j=%u Tom=%.5g My=%.5g", i, j, TomP, MyP); } } } } void CvtFlat(const float *Flat, uint LX, uint LY, vector > > &Mxs) { const uint LY1 = LY+1; Mxs.clear(); Mxs.resize(HMMSTATE_COUNT); for (uint s = 0; s < HMMSTATE_COUNT; ++s) { Mxs[s].resize(LX+1); for (uint i = 0; i <= LX; ++i) Mxs[s][i].resize(LY+1, INVALID_LOG); } for (uint s = 0; s < HMMSTATE_COUNT; ++s) for (uint i = 0; i <= LX; ++i) for (uint j = 0; j <= LY; ++j) Mxs[s][i][j] = FLATMX(s, i, j); } static void Test(const string &X, const string &Y, bool DoFwd, bool DoBwd) { Sequence &SeqX = *NewSequence(); Sequence &SeqY = *NewSequence(); const uint LX = SIZE(X); const uint LY = SIZE(Y); float *FlatFwd = myalloc(float, (LX+1)*(LY+1)*HMMSTATE_COUNT); float *FlatBwd = myalloc(float, (LX+1)*(LY+1)*HMMSTATE_COUNT); float *PostFlat = myalloc(float, LX*LY); vector > > TomMxsFwd; vector > > TomMxsBwd; vector > > SimpleMxsFwd; vector > > SimpleMxsBwd; vector > > FlatMxsFwd; vector > > FlatMxsBwd; SeqX.FromString("X", X); SeqY.FromString("Y", Y); SetAlpha(ALPHA_Amino); InitProbcons(); vector *Fwd = 0; vector *Bwd = 0; vector *TomPosterior = 0; if (DoFwd) { Fwd = PairHMM::ComputeForwardMatrix(&SeqX, &SeqY); PairHMM::ConvertFBMxs(*Fwd, LX, LY, TomMxsFwd); g_Toms = &TomMxsFwd; CalcFwdSimple(X, Y, SimpleMxsFwd); CalcFwdFlat(X, Y, FlatFwd); CvtFlat(FlatFwd, LX, LY, FlatMxsFwd); CmpFBMxs("Tom-Simple-Fwd", X, Y, TomMxsFwd, SimpleMxsFwd); CmpFBMxs("Tom-Flat-Fwd", X, Y, TomMxsFwd, FlatMxsFwd); } if (DoBwd) { Bwd = PairHMM::ComputeBackwardMatrix(&SeqX, &SeqY); PairHMM::ConvertFBMxs(*Bwd, LX, LY, TomMxsBwd); g_Toms = &TomMxsBwd; CalcBwdSimple(X, Y, 
SimpleMxsBwd); CalcBwdFlat(X, Y, FlatBwd); CvtFlat(FlatBwd, LX, LY, FlatMxsBwd); CmpFBMxs("Tom-Simple-Bwd", X, Y, TomMxsBwd, SimpleMxsBwd); CmpFBMxs("Tom-Flat-Bwd", X, Y, TomMxsBwd, FlatMxsBwd); } if (DoFwd && DoBwd) { float TomTotal = PairHMM::ComputeTotalProbability(LX, LY, *Fwd, *Bwd); float MyTotal = CalcTotalProbFlat(FlatFwd, FlatBwd, LX, LY); asserta(feq(TomTotal, MyTotal)); TomPosterior = PairHMM::ComputePosteriorMatrix(&SeqX, &SeqY, *Fwd, *Bwd); CalcPostFlat(FlatFwd, FlatBwd, LX, LY, PostFlat); SparseMatrix SM((int) LX, (int) LY, *TomPosterior); SM.LogMe(); MySparseMx MM; MM.FromPost(PostFlat, LX, LY); MM.LogMe(); //LogTomPost(*TomPosterior, LX, LY); //LogFlatPost(PostFlat, LX, LY); CmpPost(*TomPosterior, PostFlat, LX, LY); pair*, float> Result = PairHMM::ComputeAlignment(LX, LY, *TomPosterior); const vector &TomPathVec = *Result.first; float TomScore = Result.second; string TomPath; for (uint Col = 0; Col < SIZE(TomPathVec); ++Col) TomPath += TomPathVec[Col]; string Path; float *DPRows = myalloc(float, 2*(LY+1)); char *TB = myalloc(char, (LX+1)*(LY+1)); float MyScore = CalcAlnFlat(PostFlat, LX, LY, DPRows, TB, Path); Log("Scores %.3g %.3g\n", TomScore, MyScore); Log("\n"); Log(" Path %s\n", Path.c_str()); LogAln(X, Y, Path); if (Path != TomPath) { Log("\n"); Log("MyPath %s\n", Path.c_str()); } if (TomPath == Path) ++g_PathEqCount; else ++g_PathDiffCount; myfree(DPRows); myfree(TB); delete TomPosterior; TomPosterior = 0; } myfree(FlatBwd); myfree(FlatFwd); myfree(PostFlat); if (Bwd != 0) delete Bwd; if (Fwd != 0) delete Fwd; if (TomPosterior != 0) delete TomPosterior; } static void GetRandomSeq(string &Seq, uint MinLen, uint MaxLen) { Seq.clear(); uint L = MinLen + randu32() % (MaxLen - MinLen); for (uint i = 0; i < L; ++i) { uint Letter = randu32() % 20; char c = g_LetterToChar[Letter]; Seq += c; } } static void TestTiming() { vector Seqs; vector TomSeqs; const uint N = 50; const uint MAXL = 300; for (uint i = 0; i < N; ++i) { string Seq; GetRandomSeq(Seq, 
MAXL/2, MAXL); Seqs.push_back(Seq); Sequence* TomSeq = NewSequence(); TomSeq->FromString("tom", Seq); TomSeqs.push_back(TomSeq); } double MyTicks = 0; { TICKS t1 = GetClockTicks(); float* Flat = myalloc(float, MAXL * MAXL * HMMSTATE_COUNT); for (uint i = 0; i < N; ++i) { ProgressStep(i, N, "Flat"); const string& Seqi = Seqs[i]; for (uint j = 0; j < N; ++j) { const string& Seqj = Seqs[j]; CalcFwdFlat(Seqi, Seqj, Flat); CalcBwdFlat(Seqi, Seqj, Flat); } } TICKS t2 = GetClockTicks(); MyTicks = double(t2 - t1); } double TomTicks = 0; { TICKS t3 = GetClockTicks(); for (uint i = 0; i < N; ++i) { ProgressStep(i, N, "Tom"); Sequence* Seqi = TomSeqs[i]; for (uint j = 0; j < N; ++j) { Sequence *Seqj = TomSeqs[j]; vector *Fwd = PairHMM::ComputeForwardMatrix(Seqi, Seqj); vector *Bwd = PairHMM::ComputeBackwardMatrix(Seqi, Seqj); delete Fwd; delete Bwd; } } TICKS t4 = GetClockTicks(); TomTicks = double(t4 - t3); } ProgressLog("Me %.3g, Tom %.3g (%.1f%%)\n", MyTicks, TomTicks, 100*MyTicks/TomTicks); } static void TestShort(bool DoFwd, bool DoBwd) { Test("MQTIF", "MSIF", DoFwd, DoBwd); Test("GATTACA", "MQTIF", DoFwd, DoBwd); Test("ABC", "DEF", DoFwd, DoBwd); Test("LQNGSEQVENCE", "QTHERSEQVENCEINSERT", DoFwd, DoBwd); } static void TestLong(uint MaxL, bool DoFwd, bool DoBwd) { vector Seqs; for (uint i = 0; i < 10; ++i) { string Seq; GetRandomSeq(Seq, MaxL/2, MaxL); Seqs.push_back(Seq); } Seqs.push_back("LSIDGKKYDTRLVATLLWFASLVLQDHVVDRYKDAADVLITETIYALLVTFSGTVVAKHGGNASGGYLTLILNCLVQLLLLIRSNIKRCGCTIGRCLVPAIIGDDGTY"); Seqs.push_back("LEIDISKFDKSQQMIACLFEREIMKRFGFPDDLAEIWFNCRWICSFYDPVCGVSFKSDFQMKSGVASTFITNTLFLMSVIFYFWEPSPNAFGLFGGDDSLL"); Seqs.push_back("GKFDKSQGLLALLIEIGIMRRFGAPEDLVELWYYSHMYTLLKDVKTGVSLKVIFQRKSGDAATFIGNTLFLLFVLAYYFGFNSLALALLGGDDSLL"); Seqs.push_back("EEIDISKYDKSQGLLALMFECKLMKRFGVMWFNQHLSSHFYSQSTGVSGMTSFQRKSGDAATFAGNTFFLMAIVADSCKIEDLDICAFSGDDSVL"); Seqs.push_back("LEIDISKYDKSQRELALEFECKLMKYFGVPSDIVELWFNAHVLTEVYDRTTKLNALIPYQRKSGDASTFIGNTLFLMAVICDLIPVSELELALFSGDDSLL"); const 
uint N = SIZE(Seqs); for (uint i = 0; i < N; ++i) { ProgressStep(i, N, "Testing long %u %u", g_PathEqCount, g_PathDiffCount); const string &X = Seqs[i]; for (uint j = 0; j < N; ++j) { const string &Y = Seqs[j]; Test(X, Y, DoFwd, DoBwd); } } } #endif // 0 void cmd_testfb() { //opt(testfb); //Test("MQTIF", "QTIF", true, true); //TestShort(true, true); //SetAlpha(ALPHA_Amino); //TestLong(200, true, true); //ProgressLog("Eq %u diff %u\n", g_PathEqCount, g_PathDiffCount); } muscle-5.1.0/src/testlog.cpp000066400000000000000000000070371424453062600157760ustar00rootroot00000000000000#include "muscle.h" #include "timing.h" void cmd_testlog() {} #if 0 inline float HACK(float x) { assert(x >= 0.00f); assert(x <= LOG_UNDERFLOW_THRESHOLD); if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f; //if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f; //if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f; assert(x <= LOG_UNDERFLOW_THRESHOLD); return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f; } inline float LOG_ADD_HACK(float x, float y) { if (x < y) return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : HACK(y - x) + x; return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : HACK(x - y) + y; } inline float log1pexp(float x) { return x < -88.029691931f? 0.0f : log1p(exp(x)); } inline float sum_log_prob(float x, float y) { return x > y ? 
x + log1pexp(y - x) : y + log1pexp(x - y); } void cmd_testlog() { opt(testlog); const uint N = 10000; float *v = myalloc(float, N); for (uint i = 0; i < N; ++i) { float x = -float(randu32()%20) + float(randu32()%100)/10000.0f; v[i] = x; } const uint M = 1000*1000; float *mv = myalloc(float, M); ProgressLog("Making mv..."); for (uint i = 0; i < M; ++i) mv[i] = -float(randu32()%20) + float(randu32()%100)/10000.0f; ProgressLog("done.\n"); uint Diffs = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; ++j) { float x = v[i]; float y = v[j]; float Sum1 = LOG_ADD(x, y); float Sum2 = sum_log_prob(x, y); if (!feq(Sum1, Sum2)) { ProgressLog("x=%.5g y=%.5g LOG_ADD=%.3g sum_log_prob=%.3g\n", x, y, Sum1, Sum2); if (++Diffs > 20) goto Next; } } } Next: TICKS t1 = GetClockTicks(); float Total = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; ++j) { float x = v[i]; float y = v[j]; float Sum = LOG_ADD(x, y); Total += Sum; } } TICKS t2 = GetClockTicks(); ProgressLog("LOG_ADD %.4g ticks\n", double(t2 - t1)); Log("%.3g\n", Total); TICKS t3 = GetClockTicks(); Total = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; ++j) { float x = v[i]; float y = v[j]; float Sum = sum_log_prob(x, y); Total += Sum; } } TICKS t4 = GetClockTicks(); ProgressLog("sum_log_prob %.4g ticks\n", double(t4 - t3)); Log("%.3g\n", Total); TICKS t5 = GetClockTicks(); Total = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; ++j) { float x = v[i]; float y = v[j]; float Sum = mv[randu32()%M]; Total += Sum; } } Log("%.3g\n", Total); TICKS t6 = GetClockTicks(); ProgressLog("lookup %.4g ticks\n", double(t6 - t5)); TICKS t7 = GetClockTicks(); Total = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; ++j) { float x = v[i]; float y = v[j]; float Sum = LOG_ADD_HACK(x, y); Total += Sum; } } Log("%.3g\n", Total); TICKS t8 = GetClockTicks(); ProgressLog("HACK %.4g ticks\n", double(t8 - t7)); TICKS t9 = GetClockTicks(); Total = 0; for (uint i = 0; i < N; ++i) { for (uint j = 0; j < N; 
++j) { float x = v[i]; float y = v[j]; float Sum = x + y; Total += Sum; } } Log("%.3g\n", Total); TICKS t10 = GetClockTicks(); ProgressLog("NULL %.4g ticks\n", double(t10 - t9)); ProgressLog("LOG_ADD %.3g sum_log_prob %.3g lookup %.3g\n", double(t2 - t1), double(t4 - t3), double(t6 - t5)); } #endif // 0 muscle-5.1.0/src/testscoretype.cpp000066400000000000000000000021761424453062600172310ustar00rootroot00000000000000#include "muscle.h" #include "timing.h" void Test2(double P1d, double P2d) { float P1 = float(P1d); float P2 = float(P2d); float Prod = P1*P2; float Sum = P1 + P2; float logP1 = log(P1); float logP2 = log(P2); float logProd = log(Prod); float logSum = log(Sum); float sumP12 = logP1 + logP2; asserta(feq(logProd, sumP12)); float Add = LOG_ADD(logP1, logP2); asserta(feq(Add, logSum)); float PE = logP1; LOG_PLUS_EQUALS(PE, logP2); asserta(feq(PE, logSum)); ProgressLog("P1=%.3g P2=%.3g ok\n", P1, P2); } #if 0 static void TestExp() { vector Xs; Progress("Calc Xs..."); for (uint i = 0; i < 1000000; ++i) { uint r = randu32()%16; float x = -float(r)/16; Xs.push_back(x); } Progress("\n"); TICKS t1 = GetClockTicks(); float Sum = 0; for (uint i = 0; i < SIZE(Xs); ++i) { float x = Xs[i]; float E = EXP(x); Sum += E; } TICKS t2 = GetClockTicks(); TICKS t3 = GetClockTicks(); for (uint i = 0; i < SIZE(Xs); ++i) { float x = Xs[i]; float e = exp(x); Sum += e; } TICKS t4 = GetClockTicks(); ProgressLog("EXP %.3g, exp %.3g\n", double(t2 - t1), double(t4 - t3)); } #endif // 0 muscle-5.1.0/src/textfile.cpp000066400000000000000000000150001424453062600161260ustar00rootroot00000000000000#include "muscle.h" #include "textfile.h" #include TextFile::TextFile(const char szFileName[], bool bWrite) { FILE *ptrFile = 0; if (bWrite) { if (0 == strcmp(szFileName, "-")) ptrFile = stdout; else ptrFile = fopen(szFileName, "wb"); } else { if (0 == strcmp(szFileName, "-")) ptrFile = stdin; else ptrFile = fopen(szFileName, "rb"); } if (0 == ptrFile) Die("Cannot open '%s' errno=%d\n", szFileName, 
errno); Init(ptrFile, szFileName); } TextFile::TextFile(const string &FileName, bool bWrite) { const char *szFileName = FileName.c_str(); FILE *ptrFile = 0; if (bWrite) { if (0 == strcmp(szFileName, "-")) ptrFile = stdout; else ptrFile = fopen(szFileName, "wb"); } else { if (0 == strcmp(szFileName, "-")) ptrFile = stdin; else ptrFile = fopen(szFileName, "rb"); } if (0 == ptrFile) Die("Cannot open '%s' errno=%d\n", szFileName, errno); Init(ptrFile, szFileName); } void TextFile::Init(FILE *ptrFile, const char *ptrFileName) { m_ptrFile = ptrFile; m_ptrName = strdup(ptrFileName); m_uLineNr = 1; m_uColNr = 0; m_bLastCharWasEOL = true; m_cPushedBack = -1; #if DEBUG setbuf(m_ptrFile, 0); #endif } TextFile::TextFile(FILE *ptrFile, const char *ptrFileName) { Init(ptrFile, "-"); } TextFile::~TextFile() { if (m_ptrFile && m_ptrFile != stdin && m_ptrFile != stdout && m_ptrFile != stderr) fclose(m_ptrFile); free(m_ptrName); } // Get line from file. // Return true if end-of-file, quit if line too long. bool TextFile::GetLine(char szLine[], unsigned uBytes) { if (0 == uBytes) Die("TextFile::GetLine, buffer zero size"); int FillVal = 0; // suppress warning from gcc that I don't understand memset(szLine, FillVal, (size_t) uBytes); unsigned uBytesCopied = 0; // Loop until end of line or end of file. 
for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if ('\r' == c) continue; if ('\n' == c) return false; if (uBytesCopied < uBytes - 1) szLine[uBytesCopied++] = (char) c; else Die("TextFile::GetLine: input buffer too small, line %u", m_uLineNr); } } // As GetLine, but trim leading and trailing blanks; skip empty lines bool TextFile::GetTrimLine(char szLine[], unsigned uBytes) { Die("GetTrimLine"); return false; } void TextFile::Rewind() { fseek(m_ptrFile, 0, SEEK_SET); m_uLineNr = 1; m_bLastCharWasEOL = true; } void TextFile::PutChar(char c) { int i = fputc(c, m_ptrFile); assert(i == c); if ('\n' == c) { ++m_uLineNr; m_uColNr = 1; } else ++m_uColNr; } void TextFile::PutString(const char szLine[]) { int iError = fputs(szLine, m_ptrFile); assert(iError >= 0); } void TextFile::PutFormat(const char szFormat[], ...) { char szStr[4096]; va_list ArgList; va_start(ArgList, szFormat); vsprintf(szStr, szFormat, ArgList); PutString(szStr); } void TextFile::GetLineX(char szLine[], unsigned uBytes) { if (uBytes == 0) Die("GetLineX"); bool bEof = GetLine(szLine, uBytes); if (bEof) Die("end-of-file in GetLineX"); } bool TextFile::GetToken(char szToken[], unsigned uBytes, const char szCharTokens[]) { // Skip leading white space char c; for (;;) { bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) break; } // Check for special case single-character tokens if (0 != strchr(szCharTokens, c)) { assert(uBytes >= 2); szToken[0] = c; szToken[1] = 0; return false; } // Loop until token terminated by white space, EOF or special unsigned uBytesCopied = 0; for (;;) { if (uBytesCopied < uBytes - 1) szToken[uBytesCopied++] = c; else Die("TextFile::GetToken: input buffer too small, line %u", m_uLineNr); bool bEof = GetChar(c); if (bEof) { szToken[uBytesCopied] = 0; return true; } // Check for special case single-character tokens if (0 != strchr(szCharTokens, c)) { PushBack(c); assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; 
} if (isspace(c)) { assert(uBytesCopied > 0 && uBytesCopied < uBytes); szToken[uBytesCopied] = 0; return false; } } } void TextFile::GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[]) { bool bEof = GetToken(szToken, uBytes, szCharTokens); if (bEof) Die("End-of-file in GetTokenX"); } void TextFile::Skip() { for (;;) { char c; bool bEof = GetChar(c); if (bEof || '\n' == c) return; assert(isspace(c)); } } #ifdef _WIN32 TEXTFILEPOS TextFile::GetPos() { fpos_t p; int i = fgetpos(m_ptrFile, &p); assert(0 == i); assert(p >= 0); TEXTFILEPOS Pos; Pos.uOffset = (unsigned) p; Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fpos_t p = (fpos_t) Pos.uOffset; int i = fsetpos(m_ptrFile, &p); assert(0 == i); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #else TEXTFILEPOS TextFile::GetPos() { TEXTFILEPOS Pos; Pos.uOffset = ftell(m_ptrFile); Pos.uLineNr = m_uLineNr; Pos.uColNr = m_uColNr; return Pos; } void TextFile::SetPos(TEXTFILEPOS Pos) { fseek(m_ptrFile, Pos.uOffset, SEEK_SET); m_uLineNr = Pos.uLineNr; m_uColNr = Pos.uColNr; } #endif bool TextFile::GetChar(char &c) { if (-1 != m_cPushedBack) { c = (char) m_cPushedBack; m_cPushedBack = -1; return false; } int ic = fgetc(m_ptrFile); if (ic < 0) { if (feof(m_ptrFile)) { // Hack to fix up a non-empty text file that is missing // and end-of-line character in the last line. 
if (!m_bLastCharWasEOL && m_uLineNr > 0) { c = '\n'; m_bLastCharWasEOL = true; return false; } return true; } Die("TextFile::GetChar, error %s", strerror(errno)); } c = (char) ic; if ('\n' == c) { m_bLastCharWasEOL = true; ++m_uLineNr; m_uColNr = 1; } else { m_bLastCharWasEOL = false; ++m_uColNr; } return false; } void TextFile::GetCharX(char &c) { bool bEof = GetChar(c); if (bEof) Die("End-of-file in GetCharX"); } void TextFile::GetNonblankChar(char &c) { do { bool bEof = GetChar(c); if (bEof) Die("End-of-file in GetCharX"); } while (isspace(c)); } void TextFile::SkipLine() { if (m_bLastCharWasEOL) return; for (;;) { char c; bool bEof = GetChar(c); if (bEof) Die("End-of-file in SkipLine"); if ('\n' == c) break; } } void TextFile::SkipWhite() { bool bEof = SkipWhiteX(); if (bEof) Die("End-of-file skipping white space"); } bool TextFile::SkipWhiteX() { for (;;) { char c; bool bEof = GetChar(c); if (bEof) return true; if (!isspace(c)) { PushBack(c); break; } } return false; } muscle-5.1.0/src/textfile.h000066400000000000000000000030441424453062600156000ustar00rootroot00000000000000#ifndef TextFile_h #define TextFile_h #include struct TEXTFILEPOS { unsigned uOffset; unsigned uLineNr; unsigned uColNr; }; const unsigned TextFileBufferSize = 256; class TextFile { private: // no default c'tor, not implemented TextFile(); public: virtual ~TextFile(); TextFile(const char szFileName[], bool bWrite = false); TextFile(const string &FileName, bool bWrite = false); TextFile(FILE *ptrFile, const char *ptrFileName = "-"); void Close() { fclose(m_ptrFile); m_ptrFile = 0; } bool GetLine(char szLine[], unsigned uBytes); bool GetTrimLine(char szLine[], unsigned uBytes); void GetLineX(char szLine[], unsigned uBytes); bool GetToken(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}"); void Skip(); void SkipLine(); void SkipWhite(); bool SkipWhiteX(); void Rewind(); TEXTFILEPOS GetPos(); void 
SetPos(TEXTFILEPOS Pos); bool GetChar(char &c); void GetCharX(char &c); void GetNonblankChar(char &c); unsigned GetLineNr() { return m_uLineNr; } void PutString(const char szLine[]); void PutFormat(const char szFormat[], ...); void PutChar(char c); const char *GetFileName() { return m_ptrName; } void PushBack(int c) { m_cPushedBack = c; } FILE *GetStdioFile() const { return m_ptrFile; } private: void Init(FILE *ptrFile, const char *ptrFileName); private: FILE *m_ptrFile; unsigned m_uLineNr; unsigned m_uColNr; char *m_ptrName; bool m_bLastCharWasEOL; int m_cPushedBack; }; #endif // TextFile_h muscle-5.1.0/src/timing.h000066400000000000000000000011401424453062600152360ustar00rootroot00000000000000#ifndef getticks_h #define getticks_h #if 0 // ~3 x 10^9 ticks/sec #ifdef _MSC_VER #include typedef unsigned __int64 TICKS; #define GetClockTicks __rdtsc #elif defined(__APPLE__) typedef uint64_t TICKS; __inline__ uint64_t GetClockTicks() { return 0; } #elif __GNUC__ typedef uint64_t TICKS; __inline__ uint64_t GetClockTicks() { uint32_t lo, hi; /* We cannot use "=A", since this would use %rax on x86_64 */ __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); return (uint64_t)hi << 32 | lo; } #else #error "getticks_h, unknown compiler" #endif #endif #endif // getticks_h muscle-5.1.0/src/totalprobflat.cpp000066400000000000000000000005621424453062600171660ustar00rootroot00000000000000#include "muscle.h" float CalcTotalProbFlat(const float *FlatFwd, const float *FlatBwd, uint LX, uint LY) { float Sum = LOG_ZERO; uint Base = HMMSTATE_COUNT*(LX*(LY+1) + LY); for (uint s = 0; s < HMMSTATE_COUNT; ++s) { float FwdScore = FlatFwd[Base + s]; float BwdScore = FlatBwd[Base + s]; LOG_PLUS_EQUALS(Sum, FwdScore + BwdScore); } return Sum; } muscle-5.1.0/src/tracebackflat.cpp000066400000000000000000000007761424453062600171060ustar00rootroot00000000000000#include "muscle.h" void TraceBackFlat(const char *TB, uint LX, uint LY, string &Path) { Path.clear(); int i = int(LX); int j = int(LY); for 
(;;) { if (i == 0 && j == 0) break; if (i < 0 || j < 0) { Warning("TraceBackFlat i=%d j=%d", i, j); return; } char TBChar = TB[i*(LY+1) + j]; Path.push_back(TBChar); switch (TBChar) { case 'B': --i; --j; break; case 'X': --i; break; case 'Y': --j; break; } } reverse(Path.begin(), Path.end()); } muscle-5.1.0/src/transaln.cpp000066400000000000000000000412661424453062600161410ustar00rootroot00000000000000#include "muscle.h" #include "transaln.h" /*** PWPath: Pair-wise path Fresh + MSASeq {XYB} MSAPath: Length = columns in MSA Path MSASeq to columns in MSA {MG} TPath1: Fresh sequence F to MSA columns Length = columns in MSA plus nr. inserts in F {FGgI} F = fresh letter G = gap in F in pair-wise alignment I = insert in F in pair-wise alignment g = gap because MSA seq has gap in MSA TPath2: Fresh sequence F to expanded MSA columns Length = columns in expanded MSA {FGgIi} F = fresh letter G = gap in F in pair-wise alignment g = gap because MSA seq has gap in MSA I = insert in F in pair-wise alignment i = padding insert because longer insert in other fresh sequence MPath: MSA to expanded MSA Length = columns in expanded MSA {Mi} M = MSA column i = padding insert ***/ void TransAln::LogMe() const { Log("\n"); Log("Pair-wise alignments:\n"); const uint FreshCount = GetFreshCount(); for (uint FreshIndex = 0; FreshIndex < FreshCount; ++FreshIndex) { const Sequence &FreshSeq = GetFreshSeq(FreshIndex); const uint MSAIndex = GetMSAIndex(FreshIndex); const Sequence &UngappedMSASeq = GetUngappedMSASeq(MSAIndex); const string &PWPath = GetPWPath(FreshIndex); LogAln(FreshSeq, UngappedMSASeq, PWPath); } Log("\n"); Log("MSAPaths:\n"); const uint MSACount = GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { const string &MSAPath = GetMSAPath(MSAIndex); const string &MSALabel = GetMSALabel(MSAIndex); Log("%s", MSAPath.c_str()); Log(" >%s", MSALabel.c_str()); Log("\n"); } Log("\n"); Log("MaxInserts:\n"); uint M = 0; for (uint Col = 0; Col < 
SIZE(m_MSAColToMaxInserts); ++Col) { uint n = m_MSAColToMaxInserts[Col]; if (n > 0) { Log(" [%u]=%u", Col, n); ++M; } } Log(" (%u)\n", M); Log("ExtendedColCount = %u\n", m_ExtendedMSAColCount); Log("\n"); Log("\n"); Log("TPaths1:\n"); for (uint i = 0; i < SIZE(m_TPaths1); ++i) { const string &Path1 = m_TPaths1[i]; const string &FreshLabel = GetFreshLabel(i); Log("%s", Path1.c_str()); Log(" >%s", FreshLabel.c_str()); Log("\n"); } for (uint i = 0; i < SIZE(m_TPaths1); ++i) LogTPath1Aln(i); Log("\n"); Log("TPaths2:\n"); for (uint i = 0; i < SIZE(m_TPaths2); ++i) { const string &Path2 = m_TPaths2[i]; const string &FreshLabel = GetFreshLabel(i); Log("%s", Path2.c_str()); Log(" >%s", FreshLabel.c_str()); Log("\n"); } Log("\n"); Log("MPath\n"); Log("%s\n", m_MPath.c_str()); for (uint i = 0; i < SIZE(m_TPaths2); ++i) LogTPath2Aln(i, true); Log("\n"); for (uint i = 0; i < MSACount; ++i) LogMPathAln(i, false); for (uint i = 0; i < SIZE(m_TPaths2); ++i) LogTPath2Aln(i, false); } uint TransAln::GetFreshCount() const { asserta(m_FreshSeqs != 0); return m_FreshSeqs->GetSeqCount(); } uint TransAln::GetMSACount() const { asserta(m_MSA != 0); return m_MSA->GetSeqCount(); } uint TransAln::GetMSAIndex(uint FreshIndex) const { asserta(m_FreshIndexToMSAIndex != 0); asserta(FreshIndex < SIZE(*m_FreshIndexToMSAIndex)); uint MSAIndex = (*m_FreshIndexToMSAIndex)[FreshIndex]; return MSAIndex; } const string &TransAln::GetPWPath(uint FreshIndex) const { asserta(m_PWPaths != 0); asserta(FreshIndex < SIZE(*m_PWPaths)); const string &Path = (*m_PWPaths)[FreshIndex]; return Path; } const string &TransAln::GetMSAPath(uint FreshIndex) const { asserta(FreshIndex < SIZE(m_MSAPaths)); return m_MSAPaths[FreshIndex]; } const string &TransAln::GetTPath1(uint FreshIndex) const { asserta(FreshIndex < SIZE(m_TPaths1)); return m_TPaths1[FreshIndex]; } const string &TransAln::GetTPath2(uint FreshIndex) const { asserta(FreshIndex < SIZE(m_TPaths2)); return m_TPaths2[FreshIndex]; } const string 
&TransAln::GetMSALabel(uint MSAIndex) const { const Sequence &Seq = GetMSASeq(MSAIndex); return Seq.GetLabel(); } const string &TransAln::GetFreshLabel(uint FreshIndex) const { const Sequence &Seq = GetFreshSeq(FreshIndex); return Seq.GetLabel(); } const Sequence &TransAln::GetMSASeq(uint MSAIndex) const { asserta(m_MSA != 0); const Sequence *Seq = m_MSA->GetSequence(MSAIndex); asserta(Seq != 0); return *Seq; } const Sequence &TransAln::GetUngappedMSASeq(uint MSAIndex) const { asserta(MSAIndex < SIZE(m_UngappedMSASeqs)); const Sequence *Seq = m_UngappedMSASeqs[MSAIndex]; asserta(Seq != 0); return *Seq; } const Sequence &TransAln::GetFreshSeq(uint FreshIndex) const { asserta(m_FreshSeqs != 0); const Sequence *Seq = m_FreshSeqs->GetSequence(FreshIndex); asserta(Seq != 0); return *Seq; } uint TransAln::GetUngappedMSASeqLength(uint MSAIndex) const { const Sequence &Seq = GetUngappedMSASeq(MSAIndex); const uint L = Seq.GetLength(); return L; } uint TransAln::GetFreshSeqLength(uint FreshIndex) const { const Sequence &Seq = GetFreshSeq(FreshIndex); uint L = Seq.GetLength(); return L; } void TransAln::MakeTPath1(uint FreshIndex, string &Path1) const { Path1.clear(); const uint MSAColCount = GetMSAColCount(); const uint MSAIndex = GetMSAIndex(FreshIndex); const string &PWPath = GetPWPath(FreshIndex); const string &MSAPath = GetMSAPath(MSAIndex); const uint FreshL = GetFreshSeqLength(FreshIndex); const uint UL = GetUngappedMSASeqLength(MSAIndex); const uint PWColCount = SIZE(PWPath); uint NF = 0; uint NG = 0; uint NI = 0; uint Ng = 0; uint MSACol = 0; for (uint PWCol = 0; PWCol < PWColCount; ++PWCol) { char c = PWPath[PWCol]; if (c == 'B' || c == 'Y') { while (MSAPath[MSACol] == 'G') { ++MSACol; Path1 += 'g'; ++Ng; } } switch (c) { case 'B': Path1 += 'F'; ++NF; ++MSACol; break; case 'X': Path1 += 'I'; ++NI; break; case 'Y': Path1 += 'G'; ++NG; ++MSACol; break; default: asserta(false); } } while (MSACol < MSAColCount) { asserta(MSAPath[MSACol] == 'G'); ++MSACol; Path1 += 'g'; 
++Ng; } asserta(NF + NG + Ng == MSAColCount); asserta(NF + NI == FreshL); asserta(NF + NG == UL); } void TransAln::MakeMSAPath(uint MSAIndex, string &MSAPath) const { MSAPath.clear(); const Sequence &MSASeq = GetMSASeq(MSAIndex); const byte *ByteSeq = MSASeq.GetBytePtr(); asserta(MSASeq.GetLength() == m_MSAColCount); for (uint Col = 0; Col < m_MSAColCount; ++Col) { byte c = ByteSeq[Col]; if (c == '-') MSAPath += 'G'; else MSAPath += 'M'; } } void TransAln::MakeMPath(string &MPath) const { MPath.clear(); asserta(SIZE(m_MSAColToMaxInserts) == m_MSAColCount + 1); for (uint Col = 0; Col <= m_MSAColCount; ++Col) { uint Ins = m_MSAColToMaxInserts[Col]; for (uint i = 0; i < Ins; ++i) MPath += 'i'; if (Col < m_MSAColCount) MPath += 'M'; } } void TransAln::Init(const MultiSequence &MSA, const MultiSequence &FreshSeqs, const vector &FreshIndexToMSAIndex, const vector &PWPaths) { m_MSAPaths.clear(); m_TPaths1.clear(); m_TPaths2.clear(); m_UngappedMSASeqs.clear(); m_MSA = &MSA; m_FreshSeqs = &FreshSeqs; m_FreshIndexToMSAIndex = &FreshIndexToMSAIndex; m_PWPaths = &PWPaths; m_MSAColCount = MSA.GetColCount(); const uint MSASeqCount = MSA.GetSeqCount(); for (uint MSAIndex = 0; MSAIndex < MSASeqCount; ++MSAIndex) { string MSAPath; MakeMSAPath(MSAIndex, MSAPath); m_MSAPaths.push_back(MSAPath); const Sequence *Seq = MSA.GetSequence(MSAIndex); Sequence *UngappedSeq = Seq->DeleteGaps(); m_UngappedMSASeqs.push_back(UngappedSeq); } const uint FreshSeqCount = FreshSeqs.GetSeqCount(); for (uint FreshIndex = 0; FreshIndex < FreshSeqCount; ++FreshIndex) { string Path1; MakeTPath1(FreshIndex, Path1); m_TPaths1.push_back(Path1); } SetMaxInserts(); for (uint FreshIndex = 0; FreshIndex < FreshSeqCount; ++FreshIndex) { string Path2; MakeTPath2(FreshIndex, Path2); m_TPaths2.push_back(Path2); } MakeMPath(m_MPath); } void TransAln::SetMaxInserts() { const uint FreshCount = GetFreshCount(); asserta(SIZE(m_TPaths1) == FreshCount); m_MSAColToMaxInserts.clear(); 
m_MSAColToMaxInserts.resize(m_MSAColCount+1, 0); for (uint FreshIndex = 0; FreshIndex < FreshCount; ++FreshIndex) { vector MSAColToInserts; MakeMSAColToInserts(FreshIndex, MSAColToInserts); asserta(SIZE(MSAColToInserts) == m_MSAColCount + 1); for (uint MSACol = 0; MSACol <= m_MSAColCount; ++MSACol) { uint Ins = MSAColToInserts[MSACol]; m_MSAColToMaxInserts[MSACol] = max(Ins, m_MSAColToMaxInserts[MSACol]); } } m_ExtendedMSAColCount = 0; for (uint MSACol = 0; MSACol <= m_MSAColCount; ++MSACol) { uint Ins = m_MSAColToMaxInserts[MSACol]; m_ExtendedMSAColCount += Ins; if (MSACol < m_MSAColCount) ++m_ExtendedMSAColCount; } } void TransAln::MakeMSAColToInserts(uint FreshIndex, vector &MSAColToInserts) const { MSAColToInserts.clear(); const string &TPath1 = GetTPath1(FreshIndex); uint MSACol = 0; MSAColToInserts.resize(m_MSAColCount + 1, 0); const uint n = SIZE(TPath1); for (uint i = 0; i < n; ++i) { char c = TPath1[i]; switch (c) { case 'F': case 'G': case 'g': ++MSACol; break; case 'I': asserta(MSACol <= m_MSAColCount); MSAColToInserts[MSACol] += 1; break; default: asserta(false); } } asserta(MSACol == m_MSAColCount); } void TransAln::MakeTPath2(uint FreshIndex, string &Path2) const { Path2.clear(); asserta(SIZE(m_MSAColToMaxInserts) == m_MSAColCount + 1); const string &TPath1 = GetTPath1(FreshIndex); uint MSACol = 0; vector MSAColToInserts; MakeMSAColToInserts(FreshIndex, MSAColToInserts); const uint n1 = SIZE(TPath1); MSACol = 0; for (uint i = 0; i < n1; ++i) { char c = TPath1[i]; Path2 += c; if (c != 'I') { asserta(MSACol < m_MSAColCount); uint InsertCount = MSAColToInserts[MSACol]; uint MaxInsertCount = m_MSAColToMaxInserts[MSACol]; asserta(InsertCount <= MaxInsertCount); for (uint j = InsertCount; j < MaxInsertCount; ++j) Path2 += 'i'; } switch (c) { case 'F': case 'G': case 'g': ++MSACol; break; case 'I': break; default: asserta(false); } } asserta(MSACol == m_MSAColCount); uint InsertCount = MSAColToInserts[m_MSAColCount]; uint MaxInsertCount = 
m_MSAColToMaxInserts[m_MSAColCount]; asserta(InsertCount <= MaxInsertCount); for (uint j = InsertCount; j < MaxInsertCount; ++j) Path2 += 'i'; if (SIZE(Path2) != m_ExtendedMSAColCount) { LogMe(); Log("FreshIndex %u, MSAIndex %u, Path2=%s\n", FreshIndex, GetMSAIndex(FreshIndex), Path2.c_str()); Die("|Path2|=%u, m_ExtendedMSAColCount=%u", SIZE(Path2), m_ExtendedMSAColCount); } } void TransAln::LogTPath1Aln(uint FreshIndex) const { const string &TPath1 = GetTPath1(FreshIndex); const uint MSAIndex = GetMSAIndex(FreshIndex); const Sequence &F = GetFreshSeq(FreshIndex); const Sequence &U = GetUngappedMSASeq(MSAIndex); const uint FL = F.GetLength(); const uint UL = U.GetLength(); const string &FLabel = GetFreshLabel(FreshIndex); const string &ULabel = GetMSALabel(MSAIndex); const byte *Fb = F.GetBytePtr(); const byte *Ub = U.GetBytePtr(); const uint ColCount = SIZE(TPath1); uint FPos = 0; uint UPos = 0; string FRow; string URow; for (uint Col = 0; Col < ColCount; ++Col) { char c = TPath1[Col]; switch (c) { case 'F': { char f = Fb[FPos]; char u = Ub[UPos]; FRow += f; URow += u; ++FPos; ++UPos; break; } case 'G': { char u = Ub[UPos]; FRow += '-'; URow += u; ++UPos; break; } case 'I': { char f = Fb[FPos]; FRow += f; URow += '.'; ++FPos; break; } case 'g': { FRow += '.'; URow += '.'; break; } } } Log("\n"); Log("%s\n", TPath1.c_str()); Log("%s >%s\n", FRow.c_str(), FLabel.c_str()); Log("%s >%s\n", URow.c_str(), ULabel.c_str()); asserta(FPos == FL); asserta(UPos == UL); } void TransAln::LogTPath2Aln(uint FreshIndex, bool WithPath) const { const string &TPath2 = GetTPath2(FreshIndex); const uint MSAIndex = GetMSAIndex(FreshIndex); const Sequence &F = GetFreshSeq(FreshIndex); const Sequence &U = GetUngappedMSASeq(MSAIndex); const uint FL = F.GetLength(); const uint UL = U.GetLength(); const string &FLabel = GetFreshLabel(FreshIndex); const string &ULabel = GetMSALabel(MSAIndex); const byte *Fb = F.GetBytePtr(); const byte *Ub = U.GetBytePtr(); const uint ColCount = SIZE(TPath2); 
uint FPos = 0; uint UPos = 0; string FRow; string URow; for (uint Col = 0; Col < ColCount; ++Col) { char c = TPath2[Col]; switch (c) { case 'F': { char f = Fb[FPos]; char u = Ub[UPos]; FRow += f; URow += u; ++FPos; ++UPos; break; } case 'G': { char u = Ub[UPos]; FRow += '-'; URow += u; ++UPos; break; } case 'I': { char f = Fb[FPos]; FRow += f; URow += '.'; ++FPos; break; } case 'g': { FRow += '.'; URow += '.'; break; } case 'i': { FRow += '.'; URow += '.'; break; } } } if (WithPath) { Log("\n"); Log("%s\n", TPath2.c_str()); } Log("%s [F] >%s\n", FRow.c_str(), FLabel.c_str()); Log("%s [U] >%s\n", URow.c_str(), ULabel.c_str()); asserta(FPos == FL); asserta(UPos == UL); } void TransAln::LogMPathAln(uint MSAIndex, bool WithPath) const { const Sequence &M = GetMSASeq(MSAIndex); const byte *Mb = M.GetBytePtr(); const uint ColCount = SIZE(m_MPath); const string &MLabel = GetMSALabel(MSAIndex); uint MSACol = 0; string MRow; for (uint Col = 0; Col < ColCount; ++Col) { char c = m_MPath[Col]; switch (c) { case 'M': { MRow += Mb[MSACol]; ++MSACol; break; } case 'i': { MRow += '.'; break; } } } if (WithPath) { Log("\n"); Log("%s\n", m_MPath.c_str()); } Log("%s [M] >%s\n", MRow.c_str(), MLabel.c_str()); asserta(MSACol == m_MSAColCount); } Sequence *TransAln::ExtendFreshSeq(uint FreshIndex) const { const Sequence &F = GetFreshSeq(FreshIndex); const byte *Fb = F.GetBytePtr(); const string &TPath2 = GetTPath2(FreshIndex); const uint ColCount = SIZE(m_MPath); const string &FLabel = GetFreshLabel(FreshIndex); uint MSACol = 0; Sequence *FX = Sequence::NewSequence(); FX->InitData(); uint GSI = F.GetGSI(); FX->SetGSI(GSI); FX->m_Label = F.m_Label; uint FPos = 0; for (uint Col = 0; Col < ColCount; ++Col) { char c = TPath2[Col]; switch (c) { case 'F': case 'I': { char f = Fb[FPos]; FX->AppendChar(f); ++FPos; break; } case 'G': case 'g': case 'i': { FX->AppendChar('-'); break; } default: Die("Invalid char '%c' in TPath2", c); } } asserta(FX->GetLength() == m_ExtendedMSAColCount); return 
FX; } Sequence *TransAln::ExtendMSASeq(uint MSAIndex) const { const Sequence &M = GetMSASeq(MSAIndex); const byte *Mb = M.GetBytePtr(); const uint ColCount = SIZE(m_MPath); const string &MLabel = GetMSALabel(MSAIndex); uint MSACol = 0; Sequence *MX = Sequence::NewSequence(); MX->InitData(); uint GSI = M.GetGSI(); MX->SetGSI(GSI); MX->m_Label = M.m_Label; for (uint Col = 0; Col < ColCount; ++Col) { char c = m_MPath[Col]; switch (c) { case 'M': { char c = Mb[MSACol]; MX->AppendChar(c); ++MSACol; break; } case 'i': { MX->AppendChar('-'); break; } default: Die("Invalid char '%c' in MPath", c); } } asserta(MX->GetLength() == m_ExtendedMSAColCount); return MX; } void TransAln::MakeExtendedMSA() { const uint MSACount = GetMSACount(); const uint FreshCount = GetFreshCount(); m_ExtendedMSA = new MultiSequence; for (uint i = 0; i < MSACount; ++i) { Sequence *S = ExtendMSASeq(i); m_ExtendedMSA->AddSequence(S, true); } for (uint i = 0; i < FreshCount; ++i) { Sequence *S = ExtendFreshSeq(i); m_ExtendedMSA->AddSequence(S, true); } } void cmd_transaln() { const string &InputFileName = opt(transaln); const string &RefFileName = opt(ref); const string &OutputFileName = opt(output); MultiSequence InputSeqs; InputSeqs.FromFASTA(InputFileName); const uint InputSeqCount = InputSeqs.GetSeqCount(); MultiSequence RefMSA; RefMSA.FromFASTA(RefFileName); const uint RefSeqCount = RefMSA.GetSeqCount(); MultiSequence UngappedRefSeqs; for (uint i = 0; i < RefSeqCount; ++i) { const Sequence *AlignedRefSeq = RefMSA.GetSequence(i); Sequence *UngappedRefSeq = AlignedRefSeq->DeleteGaps(); UngappedRefSeqs.AddSequence(UngappedRefSeq, true); } vector PWPaths; vector FreshIndexToMSAIndex; for (uint InputSeqIndex = 0; InputSeqIndex < InputSeqCount; ++InputSeqIndex) { const uint RefSeqIndex = InputSeqIndex%RefSeqCount; FreshIndexToMSAIndex.push_back(RefSeqIndex); const Sequence *InputSeq = InputSeqs.GetSequence(InputSeqIndex); const Sequence *RefSeq = UngappedRefSeqs.GetSequence(RefSeqIndex); string 
// TransAln: combines an existing multiple alignment (m_MSA) with a set of
// additional ("fresh") sequences and one pairwise path per fresh sequence.
// A caller in this source tree uses it as: Init(...), MakeExtendedMSA(),
// then reads m_ExtendedMSA, which it expects to be non-null afterwards.
// NOTE(review): the meaning of the individual path strings (MSA path,
// TPath1, TPath2, MPath) is defined by the implementation file, which is
// not shown here -- consult it before relying on any particular encoding.
class TransAln
	{
public:
// Input data
	// Inputs are held by pointer; presumably they are borrowed from the
	// arguments of Init() and must outlive this object -- TODO confirm.
	const MultiSequence *m_MSA = 0;
	const MultiSequence *m_FreshSeqs = 0;
	const vector *m_FreshIndexToMSAIndex = 0;
	const vector *m_PWPaths = 0;
	uint m_MSAColCount = 0;

// Derived data
	uint m_ExtendedMSAColCount = 0;
	vector m_UngappedMSASeqs;
	vector m_MSAPaths;
	vector m_TPaths1;
	vector m_TPaths2;
	vector m_MSAColToMaxInserts;
	string m_MPath;
	// Result object; null until it has been built.
	MultiSequence *m_ExtendedMSA = 0;

public:
	// Supplies the four inputs listed under "Input data".
	void Init(const MultiSequence &MSA,
	  const MultiSequence &FreshSeqs,
	  const vector &FreshIndexToMSAIndex,
	  const vector &PWPaths);

	// Logging helpers (const: they do not modify this object).
	void LogMe() const;
	void LogTPath1Aln(uint FreshIndex) const;
	void LogTPath2Aln(uint FreshIndex, bool WithPath = true) const;
	void LogMPathAln(uint MSAIndex, bool WithPath = true) const;

	// Accessors. Parameters named FreshIndex index the fresh sequences,
	// parameters named MSAIndex index the sequences of the input MSA.
	// NOTE(review): GetFreshLabel, GetFreshSeqLength and ExtendFreshSeq
	// name their parameter MSAIndex although they concern fresh
	// sequences -- verify which index space they expect.
	uint GetFreshCount() const;
	uint GetMSACount() const;
	const string &GetFreshLabel(uint MSAIndex) const;
	const string &GetMSALabel(uint MSAIndex) const;
	const Sequence &GetMSASeq(uint MSAIndex) const;
	const Sequence &GetUngappedMSASeq(uint MSAIndex) const;
	const Sequence &GetFreshSeq(uint FreshIndex) const;
	uint GetMSAIndex(uint FreshIndex) const;
	const string &GetPWPath(uint FreshIndex) const;
	const string &GetMSAPath(uint FreshIndex) const;
	const string &GetTPath1(uint FreshIndex) const;
	void MakeMPath(string &MPath) const;
	const string &GetTPath2(uint FreshIndex) const;
	uint GetUngappedMSASeqLength(uint MSAIndex) const;
	uint GetFreshSeqLength(uint MSAIndex) const;
	// Returns the stored m_MSAColCount.
	uint GetMSAColCount() const { return m_MSAColCount; }

	// Builders for the derived data; results are returned through the
	// reference parameter (const methods) or stored in members.
	void MakeMSAPath(uint MSAIndex, string &MSAPath) const;
	void MakeTPath1(uint FreshIndex, string &Path1) const;
	void MakeTPath2(uint FreshIndex, string &Path2) const;
	void MakeMSAColToInserts(uint FreshIndex,
	  vector &MSAColToInserts) const;
	void SetMaxInserts();
	void MakeExtendedMSA();
	Sequence *ExtendMSASeq(uint MSAIndex) const;
	Sequence *ExtendFreshSeq(uint MSAIndex) const;
	};
SIZE(PosToTestColi1); const uint Lj = SIZE(PosToTestColj1); asserta(SIZE(PosToTestColi2) == Li); asserta(SIZE(PosToTestColj2) == Lj); const vector &RefCols = *m_RefCols; const uint RefAlignedColCount = m_QS1.m_RefAlignedColCount; asserta(m_QS2.m_RefAlignedColCount == RefAlignedColCount); uint CorrectColCount1 = 0; uint CorrectColCount2 = 0; vector Posis; vector Posjs; for (uint k = 0; k < RefAlignedColCount; ++k) { uint RefCol = RefCols[k]; uint Posi = RefColToPosi1[RefCol]; uint Posi2 = RefColToPosi2[RefCol]; if (Posi == UINT_MAX || Posi2 == UINT_MAX) continue; asserta(Posi2 == Posi); uint TestColi1 = PosToTestColi1[Posi]; uint TestColi2 = PosToTestColi2[Posi]; uint Posj = RefColToPosj1[RefCol]; uint Posj2 = m_QS2.m_RefColToPosVec[Indexj2][RefCol]; if (Posj == UINT_MAX || Posj2 == UINT_MAX) continue; asserta(Posj2 == Posj); Posis.push_back(Posi); Posjs.push_back(Posj); uint TestColj1 = PosToTestColj1[Posj]; uint TestColj2 = PosToTestColj2[Posj]; if (TestColi1 == TestColj1) ++CorrectColCount1; if (TestColi2 == TestColj2) ++CorrectColCount2; } const uint TestAlignedPosCount = SIZE(Posis); asserta(SIZE(Posjs) == TestAlignedPosCount); uint SameColCount = 0; for (uint k = 0; k < TestAlignedPosCount; ++k) { uint Posi = Posis[k]; uint TestCol1 = PosToTestColi1[Posi]; uint Posj1 = TestColToPosj1[TestCol1]; uint TestCol2 = PosToTestColi2[Posi]; uint Posj2 = TestColToPosj2[TestCol2]; if (Posj1 == Posj2) ++SameColCount; } for (uint k = 0; k < TestAlignedPosCount; ++k) { uint Posj = Posjs[k]; uint TestCol1 = PosToTestColj1[Posj]; uint Posi1 = TestColToPosi1[TestCol1]; uint TestCol2 = PosToTestColj2[Posj]; uint Posi2 = TestColToPosi2[TestCol2]; if (Posi1 == Posi2) ++SameColCount; } float Q1 = float(CorrectColCount1)/RefAlignedColCount; float Q2 = float(CorrectColCount2)/RefAlignedColCount;; float PWC = float(SameColCount)/(2*TestAlignedPosCount); m_Pairs.push_back(pair(Indexi, Indexj)); m_PairIndexToQ1.push_back(Q1); m_PairIndexToQ2.push_back(Q2); 
m_PairIndexToPWC.push_back(PWC); } void QScorer3::TransQ() { const vector &Labels = m_QS1.m_Labels; const uint N = SIZE(Labels); asserta(m_RefCols != 0); const vector &RefCols = *m_RefCols; m_Pairs.clear(); m_PairIndexToQ1.clear(); m_PairIndexToQ2.clear(); m_PairIndexToPWC.clear(); for (uint Indexi = 0; Indexi < N; ++Indexi) for (uint Indexj = Indexi + 1; Indexj < N; ++Indexj) TransQPair(Indexi, Indexj); } muscle-5.1.0/src/tree.cpp000066400000000000000000001131541424453062600152520ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include #define TRACE 0 /*** Node has 0 to 3 neighbors: 0 neighbors: singleton root 1 neighbor: leaf, neighbor is parent 2 neigbors: non-singleton root 3 neighbors: internal node (other than root) Minimal rooted tree is single node. Minimal unrooted tree is single edge. Leaf node always has nulls in neighbors 2 and 3, neighbor 1 is parent. When tree is rooted, neighbor 1=parent, 2=left, 3=right. Tree2 from Newick ================= Nbr1 Nbr2 Nbr3 :-----------------------------------------------------------: : Nbr1 Nbr2 Nbr3 : Non-leaf, unrooted : : Parent Left Right : Internal, rooted : : Parent * * : Leaf, rooted or unrooted : : * Left Right : Root : :-----------------------------------------------------------: ***/ void Tree::InitCache(unsigned uCacheCount) { m_uCacheCount = uCacheCount; m_uNeighbor1 = new unsigned[m_uCacheCount]; m_uNeighbor2 = new unsigned[m_uCacheCount]; m_uNeighbor3 = new unsigned[m_uCacheCount]; m_Ids = new unsigned[m_uCacheCount]; m_dEdgeLength1 = new double[m_uCacheCount]; m_dEdgeLength2 = new double[m_uCacheCount]; m_dEdgeLength3 = new double[m_uCacheCount]; m_dHeight = new double[m_uCacheCount]; m_bHasEdgeLength1 = new bool[m_uCacheCount]; m_bHasEdgeLength2 = new bool[m_uCacheCount]; m_bHasEdgeLength3 = new bool[m_uCacheCount]; m_bHasHeight = new bool[m_uCacheCount]; m_ptrName = new char *[m_uCacheCount]; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { 
m_uNeighbor1[uNodeIndex] = NULL_NEIGHBOR; m_uNeighbor2[uNodeIndex] = NULL_NEIGHBOR; m_uNeighbor3[uNodeIndex] = NULL_NEIGHBOR; m_bHasEdgeLength1[uNodeIndex] = false; m_bHasEdgeLength2[uNodeIndex] = false; m_bHasEdgeLength3[uNodeIndex] = false; m_bHasHeight[uNodeIndex] = false; m_dEdgeLength1[uNodeIndex] = DBL_MAX; m_dEdgeLength2[uNodeIndex] = DBL_MAX; m_dEdgeLength3[uNodeIndex] = DBL_MAX; m_dHeight[uNodeIndex] = DBL_MAX; m_ptrName[uNodeIndex] = 0; m_Ids[uNodeIndex] = UINT_MAX; } } void Tree::AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const { if (uNodeIndex1 >= m_uNodeCount || uNodeIndex2 >= m_uNodeCount) Die("AssertAreNeighbors(%u,%u), are %u nodes", uNodeIndex1, uNodeIndex2, m_uNodeCount); if (m_uNeighbor1[uNodeIndex1] != uNodeIndex2 && m_uNeighbor2[uNodeIndex1] != uNodeIndex2 && m_uNeighbor3[uNodeIndex1] != uNodeIndex2) { LogMe(); Die("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex2] != uNodeIndex1 && m_uNeighbor2[uNodeIndex2] != uNodeIndex1 && m_uNeighbor3[uNodeIndex2] != uNodeIndex1) { LogMe(); Die("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); } bool Has12 = HasEdgeLength(uNodeIndex1, uNodeIndex2); bool Has21 = HasEdgeLength(uNodeIndex2, uNodeIndex1); if (Has12 != Has21) { HasEdgeLength(uNodeIndex1, uNodeIndex2); HasEdgeLength(uNodeIndex2, uNodeIndex1); LogMe(); Log("HasEdgeLength(%u, %u)=%c HasEdgeLength(%u, %u)=%c\n", uNodeIndex1, uNodeIndex2, Has12 ? 'T' : 'F', uNodeIndex2, uNodeIndex1, Has21 ? 
'T' : 'F'); Die("Tree::AssertAreNeighbors, HasEdgeLength not symmetric"); } if (Has12) { double d12 = GetEdgeLength(uNodeIndex1, uNodeIndex2); double d21 = GetEdgeLength(uNodeIndex2, uNodeIndex1); if (d12 != d21) { LogMe(); Die("Tree::AssertAreNeighbors, Edge length disagrees %u-%u=%.3g, %u-%u=%.3g", uNodeIndex1, uNodeIndex2, d12, uNodeIndex2, uNodeIndex1, d21); } } } void Tree::ValidateNode(unsigned uNodeIndex) const { if (uNodeIndex >= m_uNodeCount) Die("ValidateNode(%u), %u nodes", uNodeIndex, m_uNodeCount); const unsigned uNeighborCount = GetNeighborCount(uNodeIndex); if (2 == uNeighborCount) { if (!m_bRooted) { LogMe(); Die("Tree::ValidateNode: Node %u has two neighbors, tree is not rooted", uNodeIndex); } if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Die("Tree::ValidateNode: Node %u has two neighbors, but not root node=%u", uNodeIndex, m_uRootNodeIndex); } } const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR == n2 && NULL_NEIGHBOR != n3) { LogMe(); Die("Tree::ValidateNode, n2=null, n3!=null", uNodeIndex); } if (NULL_NEIGHBOR == n3 && NULL_NEIGHBOR != n2) { LogMe(); Die("Tree::ValidateNode, n3=null, n2!=null", uNodeIndex); } if (n1 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n1); if (n2 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n2); if (n3 != NULL_NEIGHBOR) AssertAreNeighbors(uNodeIndex, n3); if (n1 != NULL_NEIGHBOR && (n1 == n2 || n1 == n3)) { LogMe(); Die("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n2 != NULL_NEIGHBOR && (n2 == n1 || n2 == n3)) { LogMe(); Die("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (n3 != NULL_NEIGHBOR && (n3 == n1 || n3 == n2)) { LogMe(); Die("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); } if (IsRooted()) { if (NULL_NEIGHBOR == GetParent(uNodeIndex)) { if (uNodeIndex != m_uRootNodeIndex) { LogMe(); Die("Tree::ValiateNode(%u), no parent", 
uNodeIndex); } } else if (GetLeft(GetParent(uNodeIndex)) != uNodeIndex && GetRight(GetParent(uNodeIndex)) != uNodeIndex) { LogMe(); Die("Tree::ValidateNode(%u), parent / child mismatch", uNodeIndex); } } } void Tree::Validate() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) ValidateNode(uNodeIndex); } uint Tree::GetSibling(uint Node) const { if (Node == UINT_MAX) return UINT_MAX; uint Parent = GetParent(Node); if (Parent == UINT_MAX) return UINT_MAX; uint ParentLeft = GetLeft(Parent); uint ParentRight = GetRight(Parent); asserta(ParentLeft != UINT_MAX); asserta(ParentRight != UINT_MAX); if (ParentLeft == Node) return ParentRight; if (ParentRight == Node) return ParentLeft; asserta(false); return UINT_MAX; } bool Tree::IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); return m_uNeighbor1[uNodeIndex1] == uNodeIndex2 || m_uNeighbor2[uNodeIndex1] == uNodeIndex2 || m_uNeighbor3[uNodeIndex1] == uNodeIndex2; } double Tree::GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); if (!HasEdgeLength(uNodeIndex1, uNodeIndex2)) { LogMe(); Die("Missing edge length in tree %u-%u", uNodeIndex1, uNodeIndex2); } if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_dEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_dEdgeLength3[uNodeIndex1]; } void Tree::ExpandCache() { const unsigned uNodeCount = 100; unsigned uNewCacheCount = m_uCacheCount + uNodeCount; unsigned *uNewNeighbor1 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor2 = new unsigned[uNewCacheCount]; unsigned *uNewNeighbor3 = new unsigned[uNewCacheCount]; unsigned *uNewIds = new unsigned[uNewCacheCount]; memset(uNewIds, 0xff, uNewCacheCount*sizeof(unsigned)); double *dNewEdgeLength1 = new double[uNewCacheCount]; 
double *dNewEdgeLength2 = new double[uNewCacheCount]; double *dNewEdgeLength3 = new double[uNewCacheCount]; double *dNewHeight = new double[uNewCacheCount]; bool *bNewHasEdgeLength1 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength2 = new bool[uNewCacheCount]; bool *bNewHasEdgeLength3 = new bool[uNewCacheCount]; bool *bNewHasHeight = new bool[uNewCacheCount]; char **ptrNewName = new char *[uNewCacheCount]; memset(ptrNewName, 0, uNewCacheCount*sizeof(char *)); if (m_uCacheCount > 0) { const unsigned uUnsignedBytes = m_uCacheCount*sizeof(unsigned); memcpy(uNewNeighbor1, m_uNeighbor1, uUnsignedBytes); memcpy(uNewNeighbor2, m_uNeighbor2, uUnsignedBytes); memcpy(uNewNeighbor3, m_uNeighbor3, uUnsignedBytes); memcpy(uNewIds, m_Ids, uUnsignedBytes); const unsigned uEdgeBytes = m_uCacheCount*sizeof(double); memcpy(dNewEdgeLength1, m_dEdgeLength1, uEdgeBytes); memcpy(dNewEdgeLength2, m_dEdgeLength2, uEdgeBytes); memcpy(dNewEdgeLength3, m_dEdgeLength3, uEdgeBytes); memcpy(dNewHeight, m_dHeight, uEdgeBytes); const unsigned uBoolBytes = m_uCacheCount*sizeof(bool); memcpy(bNewHasEdgeLength1, m_bHasEdgeLength1, uBoolBytes); memcpy(bNewHasEdgeLength2, m_bHasEdgeLength2, uBoolBytes); memcpy(bNewHasEdgeLength3, m_bHasEdgeLength3, uBoolBytes); memcpy(bNewHasHeight, m_bHasHeight, uBoolBytes); const unsigned uNameBytes = m_uCacheCount*sizeof(char *); memcpy(ptrNewName, m_ptrName, uNameBytes); delete[] m_uNeighbor1; delete[] m_uNeighbor2; delete[] m_uNeighbor3; delete[] m_Ids; delete[] m_dEdgeLength1; delete[] m_dEdgeLength2; delete[] m_dEdgeLength3; delete[] m_bHasEdgeLength1; delete[] m_bHasEdgeLength2; delete[] m_bHasEdgeLength3; delete[] m_bHasHeight; delete[] m_ptrName; } m_uCacheCount = uNewCacheCount; m_uNeighbor1 = uNewNeighbor1; m_uNeighbor2 = uNewNeighbor2; m_uNeighbor3 = uNewNeighbor3; m_Ids = uNewIds; m_dEdgeLength1 = dNewEdgeLength1; m_dEdgeLength2 = dNewEdgeLength2; m_dEdgeLength3 = dNewEdgeLength3; m_dHeight = dNewHeight; m_bHasEdgeLength1 = bNewHasEdgeLength1; 
m_bHasEdgeLength2 = bNewHasEdgeLength2; m_bHasEdgeLength3 = bNewHasEdgeLength3; m_bHasHeight = bNewHasHeight; m_ptrName = ptrNewName; } // Creates tree with single node, no edges. // Root node always has index 0. void Tree::CreateRooted() { Clear(); ExpandCache(); m_uNodeCount = 1; m_uNeighbor1[0] = NULL_NEIGHBOR; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_bHasEdgeLength1[0] = false; m_bHasEdgeLength2[0] = false; m_bHasEdgeLength3[0] = false; m_bHasHeight[0] = false; m_uRootNodeIndex = 0; m_bRooted = true; #if DEBUG Validate(); #endif } // Creates unrooted tree with single edge. // Nodes for that edge are always 0 and 1. void Tree::CreateUnrooted(double dEdgeLength) { Clear(); ExpandCache(); m_uNeighbor1[0] = 1; m_uNeighbor2[0] = NULL_NEIGHBOR; m_uNeighbor3[0] = NULL_NEIGHBOR; m_uNeighbor1[1] = 0; m_uNeighbor2[1] = NULL_NEIGHBOR; m_uNeighbor3[1] = NULL_NEIGHBOR; m_dEdgeLength1[0] = dEdgeLength; m_dEdgeLength1[1] = dEdgeLength; m_bHasEdgeLength1[0] = true; m_bHasEdgeLength1[1] = true; m_bRooted = false; #if DEBUG Validate(); #endif } void Tree::SetLeafName(unsigned uNodeIndex, const char *ptrName) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); free(m_ptrName[uNodeIndex]); m_ptrName[uNodeIndex] = mystrsave(ptrName); } void Tree::SetLeafId(unsigned uNodeIndex, unsigned uId) { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); m_Ids[uNodeIndex] = uId; } const char *Tree::GetLeafName(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_ptrName[uNodeIndex]; } unsigned Tree::GetLeafId(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); assert(IsLeaf(uNodeIndex)); return m_Ids[uNodeIndex]; } // Append a new branch. // This adds two new nodes and joins them to an existing leaf node. // Return value is k, new nodes have indexes k and k+1 respectively. 
unsigned Tree::AppendBranch(unsigned uExistingLeafIndex) { if (0 == m_uNodeCount) Die("Tree::AppendBranch: tree has not been created"); #if DEBUG assert(uExistingLeafIndex < m_uNodeCount); if (!IsLeaf(uExistingLeafIndex)) { LogMe(); Die("AppendBranch(%u): not leaf", uExistingLeafIndex); } #endif if (m_uNodeCount >= m_uCacheCount - 2) ExpandCache(); const unsigned uNewLeaf1 = m_uNodeCount; const unsigned uNewLeaf2 = m_uNodeCount + 1; m_uNodeCount += 2; assert(m_uNeighbor2[uExistingLeafIndex] == NULL_NEIGHBOR); assert(m_uNeighbor3[uExistingLeafIndex] == NULL_NEIGHBOR); m_uNeighbor2[uExistingLeafIndex] = uNewLeaf1; m_uNeighbor3[uExistingLeafIndex] = uNewLeaf2; m_uNeighbor1[uNewLeaf1] = uExistingLeafIndex; m_uNeighbor1[uNewLeaf2] = uExistingLeafIndex; m_uNeighbor2[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor2[uNewLeaf2] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf1] = NULL_NEIGHBOR; m_uNeighbor3[uNewLeaf2] = NULL_NEIGHBOR; m_dEdgeLength2[uExistingLeafIndex] = 0; m_dEdgeLength3[uExistingLeafIndex] = 0; m_dEdgeLength1[uNewLeaf1] = 0; m_dEdgeLength2[uNewLeaf1] = 0; m_dEdgeLength3[uNewLeaf1] = 0; m_dEdgeLength1[uNewLeaf2] = 0; m_dEdgeLength2[uNewLeaf2] = 0; m_dEdgeLength3[uNewLeaf2] = 0; m_bHasEdgeLength1[uNewLeaf1] = false; m_bHasEdgeLength2[uNewLeaf1] = false; m_bHasEdgeLength3[uNewLeaf1] = false; m_bHasEdgeLength1[uNewLeaf2] = false; m_bHasEdgeLength2[uNewLeaf2] = false; m_bHasEdgeLength3[uNewLeaf2] = false; m_bHasHeight[uNewLeaf1] = false; m_bHasHeight[uNewLeaf2] = false; m_Ids[uNewLeaf1] = UINT_MAX; m_Ids[uNewLeaf2] = UINT_MAX; return uNewLeaf1; } void Tree::LogMe() const { Log("Tree::LogMe %u nodes, ", m_uNodeCount); if (IsRooted()) { Log("rooted.\n"); Log("\n"); Log("Index Parnt LengthP Left LengthL Right LengthR Id Name\n"); Log("----- ----- ------- ---- ------- ----- ------- ----- ----\n"); } else { Log("unrooted.\n"); Log("\n"); Log("Index Nbr_1 Length1 Nbr_2 Length2 Nbr_3 Length3 Id Name\n"); Log("----- ----- ------- ----- ------- ----- ------- ----- ----\n"); } for 
(unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { Log("%5u ", uNodeIndex); const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; if (NULL_NEIGHBOR != n1) { Log("%5u ", n1); if (m_bHasEdgeLength1[uNodeIndex]) Log("%7.4f ", m_dEdgeLength1[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n2) { Log("%5u ", n2); if (m_bHasEdgeLength2[uNodeIndex]) Log("%7.4f ", m_dEdgeLength2[uNodeIndex]); else Log(" * "); } else Log(" "); if (NULL_NEIGHBOR != n3) { Log("%5u ", n3); if (m_bHasEdgeLength3[uNodeIndex]) Log("%7.4f ", m_dEdgeLength3[uNodeIndex]); else Log(" * "); } else Log(" "); if (m_Ids != 0 && IsLeaf(uNodeIndex)) { unsigned uId = m_Ids[uNodeIndex]; if (uId == UINT_MAX) Log(" *"); else Log("%5u", uId); } else Log(" "); if (m_bRooted && uNodeIndex == m_uRootNodeIndex) Log(" [ROOT] "); const char *ptrName = m_ptrName[uNodeIndex]; if (ptrName != 0) Log(" %s", ptrName); Log("\n"); } } void Tree::SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength) { assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength1[uNodeIndex1] = dLength; m_bHasEdgeLength1[uNodeIndex1] = true; } else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) { m_dEdgeLength2[uNodeIndex1] = dLength; m_bHasEdgeLength2[uNodeIndex1] = true; } else { assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); m_dEdgeLength3[uNodeIndex1] = dLength; m_bHasEdgeLength3[uNodeIndex1] = true; } if (m_uNeighbor1[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength1[uNodeIndex2] = dLength; m_bHasEdgeLength1[uNodeIndex2] = true; } else if (m_uNeighbor2[uNodeIndex2] == uNodeIndex1) { m_dEdgeLength2[uNodeIndex2] = dLength; m_bHasEdgeLength2[uNodeIndex2] = true; } else { assert(m_uNeighbor3[uNodeIndex2] == uNodeIndex1); m_dEdgeLength3[uNodeIndex2] = dLength; m_bHasEdgeLength3[uNodeIndex2] = true; 
} } unsigned Tree::UnrootFromFile() { #if TRACE Log("Before unroot:\n"); LogMe(); #endif if (!m_bRooted) Die("Tree::Unroot, not rooted"); // Convention: root node is always node zero assert(IsRoot(0)); assert(NULL_NEIGHBOR == m_uNeighbor1[0]); const unsigned uThirdNode = m_uNodeCount++; m_uNeighbor1[0] = uThirdNode; m_uNeighbor1[uThirdNode] = 0; m_uNeighbor2[uThirdNode] = NULL_NEIGHBOR; m_uNeighbor3[uThirdNode] = NULL_NEIGHBOR; m_dEdgeLength1[0] = 0; m_dEdgeLength1[uThirdNode] = 0; m_bHasEdgeLength1[uThirdNode] = true; m_bRooted = false; #if TRACE Log("After unroot:\n"); LogMe(); #endif return uThirdNode; } // In an unrooted tree, equivalent of GetLeft/Right is // GetFirst/SecondNeighbor. // uNeighborIndex must be a known neighbor of uNodeIndex. // This is the way to find the other two neighbor nodes of // an internal node. // The labeling as "First" and "Second" neighbor is arbitrary. // Calling these functions on a leaf returns NULL_NEIGHBOR, as // for GetLeft/Right. unsigned Tree::GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) return uNeighbor; } return NULL_NEIGHBOR; } unsigned Tree::GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); assert(IsEdge(uNodeIndex, uNeighborIndex)); bool bFoundOne = false; for (unsigned n = 0; n < 3; ++n) { unsigned uNeighbor = GetNeighbor(uNodeIndex, n); if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) { if (bFoundOne) return uNeighbor; else bFoundOne = true; } } return NULL_NEIGHBOR; } // Compute the number of leaves in the sub-tree defined by an edge // in an unrooted tree. 
Conceptually, the tree is cut at this edge, // and uNodeIndex2 considered the root of the sub-tree. unsigned Tree::GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, double *ptrdTotalDistance) const { assert(!IsRooted()); if (IsLeaf(uNodeIndex2)) { *ptrdTotalDistance = GetEdgeLength(uNodeIndex1, uNodeIndex2); return 1; } // Recurse down the rooted sub-tree defined by cutting the edge // and considering uNodeIndex2 as the root. const unsigned uLeft = GetFirstNeighbor(uNodeIndex2, uNodeIndex1); const unsigned uRight = GetSecondNeighbor(uNodeIndex2, uNodeIndex1); double dLeftDistance; double dRightDistance; const unsigned uLeftCount = GetLeafCountUnrooted(uNodeIndex2, uLeft, &dLeftDistance); const unsigned uRightCount = GetLeafCountUnrooted(uNodeIndex2, uRight, &dRightDistance); *ptrdTotalDistance = dLeftDistance + dRightDistance; return uLeftCount + uRightCount; } bool Tree::HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const { assert(uNodeIndex1 < m_uNodeCount); assert(uNodeIndex2 < m_uNodeCount); assert(IsEdge(uNodeIndex1, uNodeIndex2)); if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength1[uNodeIndex1]; else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) return m_bHasEdgeLength2[uNodeIndex1]; assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); return m_bHasEdgeLength3[uNodeIndex1]; } void Tree::OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex) { if (NULL_NEIGHBOR == uNodeIndex) return; if (m_uNeighbor1[uNodeIndex] == uParentNodeIndex) ; else if (m_uNeighbor2[uNodeIndex] == uParentNodeIndex) { double dEdgeLength2 = m_dEdgeLength2[uNodeIndex]; m_uNeighbor2[uNodeIndex] = m_uNeighbor1[uNodeIndex]; m_dEdgeLength2[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength2; } else { assert(m_uNeighbor3[uNodeIndex] == uParentNodeIndex); double dEdgeLength3 = m_dEdgeLength3[uNodeIndex]; m_uNeighbor3[uNodeIndex] = m_uNeighbor1[uNodeIndex]; 
m_dEdgeLength3[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; m_uNeighbor1[uNodeIndex] = uParentNodeIndex; m_dEdgeLength1[uNodeIndex] = dEdgeLength3; } OrientParent(m_uNeighbor2[uNodeIndex], uNodeIndex); OrientParent(m_uNeighbor3[uNodeIndex], uNodeIndex); } unsigned Tree::FirstDepthFirstNode() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); return uNodeIndex; } unsigned Tree::FirstDepthFirstNodeR() const { assert(IsRooted()); // Descend via left branches until we hit a leaf unsigned uNodeIndex = m_uRootNodeIndex; while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); return uNodeIndex; } unsigned Tree::NextDepthFirstNode(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetRight(uParent) == uNodeIndex) { #if TRACE Log(">> Is right branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetRight(uParent); #if TRACE Log(">> Descend left from right sibling=%u ... 
", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetLeft(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } unsigned Tree::NextDepthFirstNodeR(unsigned uNodeIndex) const { #if TRACE Log("NextDepthFirstNode(%3u) ", uNodeIndex); #endif assert(IsRooted()); assert(uNodeIndex < m_uNodeCount); if (IsRoot(uNodeIndex)) { #if TRACE Log(">> Node %u is root, end of traversal\n", uNodeIndex); #endif return NULL_NEIGHBOR; } unsigned uParent = GetParent(uNodeIndex); if (GetLeft(uParent) == uNodeIndex) { #if TRACE Log(">> Is left branch, return parent=%u\n", uParent); #endif return uParent; } uNodeIndex = GetLeft(uParent); #if TRACE Log(">> Descend right from left sibling=%u ... ", uNodeIndex); #endif while (!IsLeaf(uNodeIndex)) uNodeIndex = GetRight(uNodeIndex); #if TRACE Log("bottom out at leaf=%u\n", uNodeIndex); #endif return uNodeIndex; } static void GetMaxString(const vector &v, string &MaxStr) { asserta(!v.empty()); MaxStr = v[0]; for (uint i = 1; i < SIZE(v); ++i) MaxStr = max(MaxStr, v[i]); } static bool CompareLabels(const vector &Labels1, const vector &Labels2) { string Max1; string Max2; GetMaxString(Labels1, Max1); GetMaxString(Labels2, Max2); bool Gt = (Max1 > Max2); return Gt; } uint Tree::Ladderize(bool MoreRight) { const uint NodeCount = GetNodeCount(); uint RotatedCount = 0; for (uint Node = 0; Node < NodeCount; ++Node) { if (IsLeaf(Node)) continue; uint Left = GetLeft(Node); uint Right = GetRight(Node); uint NLeft = GetSubtreeLeafCount(Left); uint NRight = GetSubtreeLeafCount(Right); bool DoRotate = (MoreRight ? 
NRight < NLeft : NLeft < NRight); if (NLeft == NRight) { vector LeftLabels; vector RightLabels; GetSubtreeLeafLabels(Left, LeftLabels); GetSubtreeLeafLabels(Right, RightLabels); DoRotate = CompareLabels(LeftLabels, RightLabels); } if (DoRotate) { ++RotatedCount; uint Left = GetLeft(Node); uint Right = GetRight(Node); m_uNeighbor2[Node] = Right; m_uNeighbor3[Node] = Left; } } return RotatedCount; } void Tree::UnrootByDeletingRoot() { assert(IsRooted()); assert(m_uNodeCount >= 3); const unsigned uLeft = GetLeft(m_uRootNodeIndex); const unsigned uRight = GetRight(m_uRootNodeIndex); m_uNeighbor1[uLeft] = uRight; m_uNeighbor1[uRight] = uLeft; bool bHasEdgeLength = HasEdgeLength(m_uRootNodeIndex, uLeft) && HasEdgeLength(m_uRootNodeIndex, uRight); if (bHasEdgeLength) { double dEdgeLength = GetEdgeLength(m_uRootNodeIndex, uLeft) + GetEdgeLength(m_uRootNodeIndex, uRight); m_dEdgeLength1[uLeft] = dEdgeLength; m_dEdgeLength1[uRight] = dEdgeLength; } // Remove root node entry from arrays const unsigned uMoveCount = m_uNodeCount - m_uRootNodeIndex; const unsigned uUnsBytes = uMoveCount*sizeof(unsigned); memmove(m_uNeighbor1 + m_uRootNodeIndex, m_uNeighbor1 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor2 + m_uRootNodeIndex, m_uNeighbor2 + m_uRootNodeIndex + 1, uUnsBytes); memmove(m_uNeighbor3 + m_uRootNodeIndex, m_uNeighbor3 + m_uRootNodeIndex + 1, uUnsBytes); const unsigned uDoubleBytes = uMoveCount*sizeof(double); memmove(m_dEdgeLength1 + m_uRootNodeIndex, m_dEdgeLength1 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength2 + m_uRootNodeIndex, m_dEdgeLength2 + m_uRootNodeIndex + 1, uDoubleBytes); memmove(m_dEdgeLength3 + m_uRootNodeIndex, m_dEdgeLength3 + m_uRootNodeIndex + 1, uDoubleBytes); const unsigned uBoolBytes = uMoveCount*sizeof(bool); memmove(m_bHasEdgeLength1 + m_uRootNodeIndex, m_bHasEdgeLength1 + m_uRootNodeIndex + 1, uBoolBytes); memmove(m_bHasEdgeLength2 + m_uRootNodeIndex, m_bHasEdgeLength2 + m_uRootNodeIndex + 1, uBoolBytes); 
memmove(m_bHasEdgeLength3 + m_uRootNodeIndex, m_bHasEdgeLength3 + m_uRootNodeIndex + 1, uBoolBytes); const unsigned uPtrBytes = uMoveCount*sizeof(char *); memmove(m_ptrName + m_uRootNodeIndex, m_ptrName + m_uRootNodeIndex + 1, uPtrBytes); --m_uNodeCount; m_bRooted = false; // Fix up table entries for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { #define DEC(x) if (x != NULL_NEIGHBOR && x > m_uRootNodeIndex) --x; DEC(m_uNeighbor1[uNodeIndex]) DEC(m_uNeighbor2[uNodeIndex]) DEC(m_uNeighbor3[uNodeIndex]) #undef DEC } Validate(); } unsigned Tree::GetLeafParent(unsigned uNodeIndex) const { assert(IsLeaf(uNodeIndex)); if (IsRooted()) return GetParent(uNodeIndex); if (m_uNeighbor1[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor1[uNodeIndex]; if (m_uNeighbor2[uNodeIndex] != NULL_NEIGHBOR) return m_uNeighbor2[uNodeIndex]; return m_uNeighbor3[uNodeIndex]; } uint Tree::GetLCA(uint Node1, uint Node2) const { vector Path1; vector Path2; GetPathToRoot(Node1, Path1); GetPathToRoot(Node2, Path2); const uint N1 = SIZE(Path1); const uint N2 = SIZE(Path2); for (uint i = 0; i < N1; ++i) { uint AncNode1 = Path1[i]; for (uint j = 0; j < N2; ++j) { if (Path2[j] == AncNode1) return AncNode1; } } asserta(false); return UINT_MAX; } void Tree::GetPathToRoot(uint Node, vector &Path) const { if (!IsRooted()) Die("GetPathToRoot(), not rooted"); const uint NodeCount = GetNodeCount(); Path.clear(); for (;;) { asserta(Node < NodeCount); Path.push_back(Node); asserta(SIZE(Path) <= NodeCount); if (IsRoot(Node)) return; Node = GetParent(Node); } } // AncNode must be on path from Node to root double Tree::GetDistance(uint Node, uint AncNode) const { if (Node == m_uRootNodeIndex && AncNode == UINT_MAX) return 0; vector Path; GetPathToRoot(Node, Path); double Distance = 0; const uint n = SIZE(Path); asserta(Path[0] == Node); for (uint i = 0; i < n; ++i) { uint PathNode = Path[i]; if (PathNode == AncNode) return Distance; // Node1 is parent if (m_bHasEdgeLength1[PathNode]) { double 
// Appends to Leaves the node index of every leaf in the sub-tree reached
// from Node through neighbor slots 2 and 3 (slot 2 is visited first).
// Leaves is NOT cleared: results are added after whatever it already holds.
// Both asserta() checks run on every call, including the recursive ones.
void Tree::AppendLeaves(uint Node, vector &Leaves) const
	{
	uint NodeCount = GetNodeCount();
	uint LeafCount = GetLeafCount();
	asserta(Node < NodeCount);
	// Fails as soon as Leaves already holds LeafCount (or more) entries on
	// entry to any call, e.g. when the caller passes a vector that was not
	// emptied beforehand.
	asserta(SIZE(Leaves) < LeafCount);
	if (IsLeaf(Node))
		Leaves.push_back(Node);
	else
		{
		// Non-leaf: recurse into slots 2 and 3 (slot 1 is not followed).
		uint Edge2 = m_uNeighbor2[Node];
		uint Edge3 = m_uNeighbor3[Node];
		AppendLeaves(Edge2, Leaves);
		AppendLeaves(Edge3, Leaves);
		}
	}
double Tree::GetNodeHeight(unsigned uNodeIndex) const { if (!IsRooted()) Die("Tree::GetNodeHeight: undefined unless rooted tree"); if (IsLeaf(uNodeIndex)) return 0.0; if (m_bHasHeight[uNodeIndex]) return m_dHeight[uNodeIndex]; const unsigned uLeft = GetLeft(uNodeIndex); const unsigned uRight = GetRight(uNodeIndex); double dLeftLength = GetEdgeLength(uNodeIndex, uLeft); double dRightLength = GetEdgeLength(uNodeIndex, uRight); if (dLeftLength < 0) dLeftLength = 0; if (dRightLength < 0) dRightLength = 0; const double dLeftHeight = dLeftLength + GetNodeHeight(uLeft); const double dRightHeight = dRightLength + GetNodeHeight(uRight); const double dHeight = (dLeftHeight + dRightHeight)/2; m_bHasHeight[uNodeIndex] = true; m_dHeight[uNodeIndex] = dHeight; return dHeight; } unsigned Tree::GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const { assert(uNodeIndex < m_uNodeCount); assert(uNeighborIndex < m_uNodeCount); if (uNeighborIndex == m_uNeighbor1[uNodeIndex]) return 0; if (uNeighborIndex == m_uNeighbor2[uNodeIndex]) return 1; if (uNeighborIndex == m_uNeighbor3[uNodeIndex]) return 2; return NULL_NEIGHBOR; } unsigned Tree::GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const { switch (uNeighborSubscript) { case 0: return m_uNeighbor1[uNodeIndex]; case 1: return m_uNeighbor2[uNodeIndex]; case 2: return m_uNeighbor3[uNodeIndex]; } Die("Tree::GetNeighbor, sub=%u", uNeighborSubscript); return NULL_NEIGHBOR; } // TODO: check if this is a performance issue, could cache a lookup table unsigned Tree::LeafIndexToNodeIndex(unsigned uLeafIndex) const { const unsigned uNodeCount = GetNodeCount(); unsigned uLeafCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (IsLeaf(uNodeIndex)) { if (uLeafCount == uLeafIndex) return uNodeIndex; else ++uLeafCount; } } Die("LeafIndexToNodeIndex: out of range"); return 0; } uint Tree::GetNodeIndex(const string &Label) const { return GetNodeIndex(Label.c_str()); } unsigned 
Tree::GetNodeIndex(const char *ptrName) const { const unsigned uNodeCount = GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const char *ptrLeafName = m_ptrName[uNodeIndex]; if (ptrLeafName != 0 && 0 == strcmp(ptrName, ptrLeafName)) return uNodeIndex; } Die("Tree::GetLeafNodeIndex, name not found"); return 0; } void Tree::Copy(const Tree &tree) { const unsigned uNodeCount = tree.GetNodeCount(); InitCache(uNodeCount); m_uNodeCount = uNodeCount; const size_t UnsignedBytes = uNodeCount*sizeof(unsigned); const size_t DoubleBytes = uNodeCount*sizeof(double); const size_t BoolBytes = uNodeCount*sizeof(bool); memcpy(m_uNeighbor1, tree.m_uNeighbor1, UnsignedBytes); memcpy(m_uNeighbor2, tree.m_uNeighbor2, UnsignedBytes); memcpy(m_uNeighbor3, tree.m_uNeighbor3, UnsignedBytes); memcpy(m_Ids, tree.m_Ids, UnsignedBytes); memcpy(m_dEdgeLength1, tree.m_dEdgeLength1, DoubleBytes); memcpy(m_dEdgeLength2, tree.m_dEdgeLength2, DoubleBytes); memcpy(m_dEdgeLength3, tree.m_dEdgeLength3, DoubleBytes); memcpy(m_dHeight, tree.m_dHeight, DoubleBytes); memcpy(m_bHasEdgeLength1, tree.m_bHasEdgeLength1, BoolBytes); memcpy(m_bHasEdgeLength2, tree.m_bHasEdgeLength2, BoolBytes); memcpy(m_bHasEdgeLength3, tree.m_bHasEdgeLength3, BoolBytes); memcpy(m_bHasHeight, tree.m_bHasHeight, BoolBytes); m_uRootNodeIndex = tree.m_uRootNodeIndex; m_bRooted = tree.m_bRooted; for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) { const char *ptrName = tree.GetLeafName(uNodeIndex); m_ptrName[uNodeIndex] = mystrsave(ptrName); } else m_ptrName[uNodeIndex] = 0; } #if DEBUG Validate(); #endif } void Tree::ToVectors(vector &Labels, vector &Parents, vector &Lengths) const { asserta(IsRooted()); Labels.clear(); Parents.clear(); Lengths.clear(); const uint NodeCount = GetNodeCount(); for (uint Node = 0; Node < NodeCount; ++Node) { string Label; GetLabel(Node, Label); uint Parent = GetParent(Node); float Length = 0; if (Parent != 
UINT_MAX) Length = (float) GetEdgeLength(Node, Parent); Labels.push_back(Label); Parents.push_back(Parent); Lengths.push_back(Length); } } void Tree::FromVectors(const vector &Labels, const vector &Parents, const vector &Lengths) { Clear(); const uint NodeCount = SIZE(Labels); asserta(SIZE(Parents) == NodeCount); asserta(SIZE(Lengths) == NodeCount); vector Lefts(NodeCount, UINT_MAX); vector Rights(NodeCount, UINT_MAX); uint Root = UINT_MAX; for (uint Node = 0; Node < NodeCount; ++Node) { uint Parent = Parents[Node]; if (Parent == UINT_MAX) { asserta(Root == UINT_MAX); Root = Node; continue; } asserta(Parent < NodeCount); if (Lefts[Parent] == UINT_MAX) Lefts[Parent] = Node; else if (Rights[Parent] == UINT_MAX) Rights[Parent] = Node; else Die("Tree::FromVectors(), invalid vector topology"); } asserta(Root != UINT_MAX); vector LeafNodes; vector IntNodes; vector NodeToLeafIndex(NodeCount, UINT_MAX); vector NodeToIntIndex(NodeCount, UINT_MAX); for (uint Node = 0; Node < NodeCount; ++Node) { if (Lefts[Node] == UINT_MAX) { asserta(Rights[Node] == UINT_MAX); uint LeafIndex = SIZE(LeafNodes); NodeToLeafIndex[Node] = LeafIndex; LeafNodes.push_back(Node); } else { asserta(Rights[Node] != UINT_MAX); uint IntIndex = SIZE(IntNodes); NodeToIntIndex[Node] = IntIndex; IntNodes.push_back(Node); } } uint LeafCount = SIZE(LeafNodes); uint IntCount = SIZE(IntNodes); asserta(LeafCount == (NodeCount + 1)/2); asserta(IntCount = LeafCount - 1); m_uNodeCount = NodeCount; InitCache(NodeCount); // Leaves for (uint i = 0; i < LeafCount; ++i) { uint Node = LeafNodes[i]; asserta(Node < SIZE(Labels)); const string &Label = Labels[Node]; asserta(Label != ""); m_Ids[i] = i; m_ptrName[i] = mystrsave(Label.c_str()); } // Internal ndoes for (uint i = 0; i < IntCount; ++i) { uint Node = IntNodes[i]; asserta(Node < SIZE(Lefts)); asserta(Node < SIZE(Rights)); uint NewNode = LeafCount + i; uint Left = Lefts[Node]; uint Right = Rights[Node]; uint LeftIntIndex = NodeToIntIndex[Left]; uint RightIntIndex = 
NodeToIntIndex[Right]; uint LeftLeafIndex = NodeToLeafIndex[Left]; uint RightLeafIndex = NodeToLeafIndex[Right]; uint NewLeftNode = UINT_MAX; if (LeftIntIndex == UINT_MAX) { asserta(LeftLeafIndex < LeafCount); NewLeftNode = LeftLeafIndex; } else { asserta(LeftIntIndex < IntCount); NewLeftNode = LeafCount + LeftIntIndex; } uint NewRightNode = UINT_MAX; if (RightIntIndex == UINT_MAX) { asserta(RightLeafIndex < LeafCount); NewRightNode = RightLeafIndex; } else { asserta(RightIntIndex < IntCount); NewRightNode = LeafCount + RightIntIndex; } m_uNeighbor2[NewNode] = NewLeftNode; m_uNeighbor3[NewNode] = NewRightNode; m_uNeighbor1[NewLeftNode] = NewNode; m_uNeighbor1[NewRightNode] = NewNode; m_bHasEdgeLength1[NewLeftNode] = false; m_bHasEdgeLength1[NewRightNode] = false; float LeftLength = Lengths[Left]; float RightLength = Lengths[Right]; m_bHasEdgeLength2[NewNode] = false; m_bHasEdgeLength3[NewNode] = false; if (LeftLength != MISSING_LENGTH) { m_bHasEdgeLength2[NewNode] = true; m_dEdgeLength2[NewNode] = LeftLength; m_bHasEdgeLength1[NewLeftNode] = true; m_dEdgeLength1[NewLeftNode] = LeftLength; } if (RightLength != MISSING_LENGTH) { m_bHasEdgeLength3[NewNode] = true; m_dEdgeLength3[NewNode] = RightLength; m_bHasEdgeLength1[NewRightNode] = true; m_dEdgeLength1[NewRightNode] = RightLength; } } uint NewRootIntIndex = NodeToIntIndex[Root]; asserta(NewRootIntIndex != UINT_MAX); m_bRooted = true; m_uRootNodeIndex = LeafCount + NewRootIntIndex; Validate(); } // Create rooted tree from a vector description. // Node indexes are 0..N-1 for leaves, N..2N-2 for // internal nodes. // Vector subscripts are i-N and have values for // internal nodes only, but those values are node // indexes 0..2N-2. So e.g. if N=6 and Left[2]=1, // this means that the third internal node (node index 8) // has the second leaf (node index 1) as its left child. // uRoot gives the vector subscript of the root, so add N // to get the node index. 
void Tree::Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char **LeafNames) { Clear(); m_uNodeCount = 2*uLeafCount - 1; InitCache(m_uNodeCount); for (unsigned uNodeIndex = 0; uNodeIndex < uLeafCount; ++uNodeIndex) { m_Ids[uNodeIndex] = LeafIds[uNodeIndex]; m_ptrName[uNodeIndex] = mystrsave(LeafNames[uNodeIndex]); } for (unsigned uNodeIndex = uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex) { unsigned v = uNodeIndex - uLeafCount; unsigned uLeft = Left[v]; unsigned uRight = Right[v]; float fLeft = LeftLength[v]; float fRight = RightLength[v]; m_uNeighbor2[uNodeIndex] = uLeft; m_uNeighbor3[uNodeIndex] = uRight; m_bHasEdgeLength2[uNodeIndex] = true; m_bHasEdgeLength3[uNodeIndex] = true; m_dEdgeLength2[uNodeIndex] = fLeft; m_dEdgeLength3[uNodeIndex] = fRight; m_uNeighbor1[uLeft] = uNodeIndex; m_uNeighbor1[uRight] = uNodeIndex; m_dEdgeLength1[uLeft] = fLeft; m_dEdgeLength1[uRight] = fRight; m_bHasEdgeLength1[uLeft] = true; m_bHasEdgeLength1[uRight] = true; } m_bRooted = true; m_uRootNodeIndex = uRoot + uLeafCount; Validate(); } muscle-5.1.0/src/tree.h000066400000000000000000000210631424453062600147140ustar00rootroot00000000000000#ifndef tree_h #define tree_h #include class Clust; const unsigned NULL_NEIGHBOR = UINT_MAX; const double MISSING_LENGTH = DBL_MAX; enum NEWICK_TOKEN_TYPE { NTT_Unknown, // Returned from Tree::GetToken: NTT_Lparen, NTT_Rparen, NTT_Colon, NTT_Comma, NTT_Semicolon, NTT_String, // Following are never returned from Tree::GetToken: NTT_SingleQuotedString, NTT_DoubleQuotedString, NTT_Comment }; class Tree { public: Tree() { m_uNodeCount = 0; m_uCacheCount = 0; m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_dHeight = 0; m_bHasEdgeLength1 = 0; m_bHasEdgeLength2 = 0; m_bHasEdgeLength3 = 0; m_bHasHeight = 0; m_ptrName = 0; m_Ids = 0; } virtual ~Tree() { Clear(); } 
void Clear() { for (unsigned n = 0; n < m_uNodeCount; ++n) free(m_ptrName[n]); m_uNodeCount = 0; m_uCacheCount = 0; #define del(x) if (x != 0) delete[] x; x = 0; del(m_uNeighbor1) del(m_uNeighbor2) del(m_uNeighbor3) del(m_dEdgeLength1) del(m_dEdgeLength2) del(m_dEdgeLength3) del(m_bHasEdgeLength1) del(m_bHasEdgeLength2) del(m_bHasEdgeLength3) del(m_ptrName) del(m_Ids) del(m_bHasHeight) del(m_dHeight) #undef del m_uNeighbor1 = 0; m_uNeighbor2 = 0; m_uNeighbor3 = 0; m_dEdgeLength1 = 0; m_dEdgeLength2 = 0; m_dEdgeLength3 = 0; m_ptrName = 0; m_Ids = 0; m_uRootNodeIndex = 0; m_bHasHeight = 0; m_dHeight = 0; m_bRooted = false; } // Creation and manipulation void CreateRooted(); void CreateUnrooted(double dEdgeLength); void FromFile(const string &FileName); void FromFile(TextFile &File); void FromClust(Clust &C); void Copy(const Tree &tree); void Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], const unsigned Right[], const float LeftLength[], const float RightLength[], const unsigned LeafIds[], char *LeafNames[]); void FromVectors(const vector &Labels, const vector &Parents, const vector &Lengths); void ToVectors(vector &Labels, vector &Parents, vector &Lengths) const; unsigned AppendBranch(unsigned uExistingNodeIndex); void SetLeafName(unsigned uNodeIndex, const char *ptrName); void SetLeafId(unsigned uNodeIndex, unsigned uId); void SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, double dLength); void RootUnrootedTree(unsigned uNodeIndex1, unsigned uNodeIndex2); void UnrootByDeletingRoot(); uint Ladderize(bool Right); // Saving to file void ToFile(TextFile &File) const; void ToFile(const string &FileName) const; // Accessor functions unsigned GetNodeCount() const { return m_uNodeCount; } unsigned GetLeafCount() const { if (m_bRooted) { assert(m_uNodeCount%2 == 1); return (m_uNodeCount + 1)/2; } else { assert(m_uNodeCount%2 == 0); return (m_uNodeCount + 2)/2; } } uint GetRoot() const { return m_uRootNodeIndex; } unsigned GetNeighbor(unsigned 
uNodeIndex, unsigned uNeighborSubscript) const; unsigned GetNeighbor1(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor1[uNodeIndex]; } unsigned GetNeighbor2(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor2[uNodeIndex]; } unsigned GetNeighbor3(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_uNeighbor3[uNodeIndex]; } unsigned GetParent(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor1[uNodeIndex]; } bool IsRooted() const { return m_bRooted; } uint GetSibling(uint uNodeIndex) const; unsigned GetLeft(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor2[uNodeIndex]; } unsigned GetRight(unsigned uNodeIndex) const { assert(m_bRooted && uNodeIndex < m_uNodeCount); return m_uNeighbor3[uNodeIndex]; } const char *GetName(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); return m_ptrName[uNodeIndex]; } const void GetLabel(uint uNodeIndex, string &Label) const { assert(uNodeIndex < m_uNodeCount); const char *Name = m_ptrName[uNodeIndex]; if (Name == 0) Label = ""; else Label = string(Name); } unsigned GetRootNodeIndex() const { assert(m_bRooted); return m_uRootNodeIndex; } unsigned GetNeighborCount(unsigned uNodeIndex) const { const unsigned n1 = m_uNeighbor1[uNodeIndex]; const unsigned n2 = m_uNeighbor2[uNodeIndex]; const unsigned n3 = m_uNeighbor3[uNodeIndex]; return (NULL_NEIGHBOR != n1) + (NULL_NEIGHBOR != n2) + (NULL_NEIGHBOR != n3); } bool IsLeaf(unsigned uNodeIndex) const { assert(uNodeIndex < m_uNodeCount); if (1 == m_uNodeCount) return true; return 1 == GetNeighborCount(uNodeIndex); } bool IsRoot(unsigned uNodeIndex) const { return IsRooted() && m_uRootNodeIndex == uNodeIndex; } unsigned GetLeafId(unsigned uNodeIndex) const; unsigned GetNodeIndex(const char *ptrName) const; unsigned GetNodeIndex(const string &Label) const; bool IsEdge(unsigned uNodeIndex1, unsigned 
uNodeIndex2) const; bool HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; double GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; const char *GetLeafName(unsigned uNodeIndex) const; unsigned GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const; double GetNodeHeight(unsigned uNodeIndex) const; void AppendLeaves(uint Node, vector &Leaves) const; uint GetSubtreeLeafCount(uint Node) const; void GetSubtreeLeafLabels(uint Node, vector &Labels) const; void GetSubtreeLeafNodes(uint Node, vector &Labels) const; void GetPathToRoot(uint Node, vector &Path) const; double GetDistance(uint Node, uint AncNode) const; void GetLeafLabels(vector &Labels) const; // Depth-first traversal unsigned FirstDepthFirstNode() const; unsigned NextDepthFirstNode(unsigned uNodeIndex) const; unsigned FirstDepthFirstNodeR() const; unsigned NextDepthFirstNodeR(unsigned uNodeIndex) const; // Equivalent of GetLeft/Right in unrooted tree, works in rooted tree too. unsigned GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; unsigned GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; // Getting parent node in unrooted tree defined iff leaf unsigned GetLeafParent(unsigned uNodeIndex) const; // Misc const char *NTTStr(NEWICK_TOKEN_TYPE NTT) const; void FindCenterByLongestSpan(unsigned *ptrNodeIndex1, unsigned *ptrNodeIndex2) const; void PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount); unsigned LeafIndexToNodeIndex(unsigned uLeafIndex) const; // Debugging & trouble-shooting support void Validate() const; void ValidateNode(unsigned uNodeIndex) const; void AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const; void LogMe() const; uint GetLCA(uint Node1, uint Node2) const; private: unsigned UnrootFromFile(); NEWICK_TOKEN_TYPE GetTokenVerbose(TextFile &File, char szToken[], unsigned uBytes) const { NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, uBytes); Log("GetToken %10.10s %s\n", 
NTTStr(NTT), szToken); return NTT; } void InitCache(unsigned uCacheCount); void ExpandCache(); NEWICK_TOKEN_TYPE GetToken(TextFile &File, char szToken[], unsigned uBytes) const; bool GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength); unsigned GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, double *ptrdTotalDistance) const; void ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const; void ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const; void OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex); double FromClustNode(const Clust &C, unsigned uClustNodeIndex, unsigned uPhyNodeIndex); unsigned GetAnyNonLeafNode() const; // Yuck. Data is made public for the convenience of Tree::Copy. // There has to be a better way. public: unsigned m_uNodeCount; unsigned m_uCacheCount; unsigned *m_uNeighbor1; unsigned *m_uNeighbor2; unsigned *m_uNeighbor3; double *m_dEdgeLength1; double *m_dEdgeLength2; double *m_dEdgeLength3; double *m_dHeight; bool *m_bHasEdgeLength1; bool *m_bHasEdgeLength2; bool *m_bHasEdgeLength3; bool *m_bHasHeight; unsigned *m_Ids; char **m_ptrName; bool m_bRooted; unsigned m_uRootNodeIndex; }; struct PhyEnumEdgeState { PhyEnumEdgeState() { m_bInit = false; m_uNodeIndex1 = NULL_NEIGHBOR; m_uNodeIndex2 = NULL_NEIGHBOR; } bool m_bInit; unsigned m_uNodeIndex1; unsigned m_uNodeIndex2; }; const unsigned NODE_CHANGED = (unsigned) (~0); #endif // tree_h muscle-5.1.0/src/tree2.cpp000066400000000000000000000161431424453062600153340ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #define TRACE 0 // Return false when done bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = UINT_MAX; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNode(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNode(ES.m_uNodeIndex1); if 
(NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } bool PhyEnumEdgesR(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = UINT_MAX; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNodeR(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNodeR(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; } static void GetLeavesSubtree(const Tree &tree, unsigned uNodeIndex1, const unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { if (tree.IsLeaf(uNodeIndex1)) { Leaves[*ptruCount] = uNodeIndex1; ++(*ptruCount); return; } const unsigned uLeft = tree.GetFirstNeighbor(uNodeIndex1, uNodeIndex2); const unsigned uRight = tree.GetSecondNeighbor(uNodeIndex1, uNodeIndex2); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtree(tree, uLeft, uNodeIndex1, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtree(tree, uRight, uNodeIndex1, Leaves, ptruCount); } static void PhyGetLeaves(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtree(tree, uNodeIndex1, uNodeIndex2, Leaves, ptruCount); } bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, unsigned Leaves1[], unsigned *ptruCount1, unsigned Leaves2[], unsigned *ptruCount2) { bool bOk = PhyEnumEdges(tree, ES); if (!bOk) { *ptruCount1 = 0; *ptruCount2 = 0; return false; } // Special case: in a rooted tree, both edges from the root 
// give the same bipartition, so skip one of them. if (tree.IsRooted() && tree.IsRoot(ES.m_uNodeIndex2) && tree.GetRight(ES.m_uNodeIndex2) == ES.m_uNodeIndex1) { bOk = PhyEnumEdges(tree, ES); if (!bOk) return false; } PhyGetLeaves(tree, ES.m_uNodeIndex1, ES.m_uNodeIndex2, Leaves1, ptruCount1); PhyGetLeaves(tree, ES.m_uNodeIndex2, ES.m_uNodeIndex1, Leaves2, ptruCount2); if (*ptruCount1 + *ptruCount2 != tree.GetLeafCount()) Die("PhyEnumBiParts %u + %u != %u", *ptruCount1, *ptruCount2, tree.GetLeafCount()); #if DEBUG { for (unsigned i = 0; i < *ptruCount1; ++i) { if (!tree.IsLeaf(Leaves1[i])) Die("PhyEnumByParts: not leaf"); for (unsigned j = 0; j < *ptruCount2; ++j) { if (!tree.IsLeaf(Leaves2[j])) Die("PhyEnumByParts: not leaf"); if (Leaves1[i] == Leaves2[j]) Die("PhyEnumByParts: dupe"); } } } #endif return true; } #if 0 void TestBiPart() { SetListFileName("c:\\tmp\\lobster.log", false); Tree tree; TextFile fileIn("c:\\tmp\\test.phy"); tree.FromFile(fileIn); tree.LogMe(); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves1 = new unsigned[uNodeCount]; unsigned *Leaves2 = new unsigned[uNodeCount]; PhyEnumEdgeState ES; bool bDone = false; for (;;) { unsigned uCount1 = UINT_MAX; unsigned uCount2 = UINT_MAX; bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2); Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n", bOk, ES.m_bInit, ES.m_uNodeIndex1, ES.m_uNodeIndex2); if (!bOk) break; Log("\n"); Log("Part1: "); for (unsigned n = 0; n < uCount1; ++n) Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n])); Log("\n"); Log("Part2: "); for (unsigned n = 0; n < uCount2; ++n) Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n])); Log("\n"); } } #endif static void GetLeavesSubtreeExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { if (uNodeIndex == uExclude) return; if (tree.IsLeaf(uNodeIndex)) { Leaves[*ptruCount] = uNodeIndex; ++(*ptruCount); return; } const unsigned uLeft = 
tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); if (NULL_NEIGHBOR != uLeft) GetLeavesSubtreeExcluding(tree, uLeft, uExclude, Leaves, ptruCount); if (NULL_NEIGHBOR != uRight) GetLeavesSubtreeExcluding(tree, uRight, uExclude, Leaves, ptruCount); } void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) { *ptruCount = 0; GetLeavesSubtreeExcluding(tree, uNodeIndex, uExclude, Leaves, ptruCount); } void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]) { const unsigned uNodeCount = tree.GetNodeCount(); if (uNodeCount < 3) Die("GetInternalNodesInHeightOrder: %u nodes, none are internal", uNodeCount); const unsigned uInternalNodeCount = (uNodeCount - 1)/2; double *Heights = new double[uInternalNodeCount]; unsigned uIndex = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) continue; NodeIndexes[uIndex] = uNodeIndex; Heights[uIndex] = tree.GetNodeHeight(uNodeIndex); ++uIndex; } if (uIndex != uInternalNodeCount) Die("Internal error: GetInternalNodesInHeightOrder"); // Simple but slow bubble sort (probably don't care about speed here) bool bDone = false; while (!bDone) { bDone = true; for (unsigned i = 0; i < uInternalNodeCount - 1; ++i) { if (Heights[i] > Heights[i+1]) { double dTmp = Heights[i]; Heights[i] = Heights[i+1]; Heights[i+1] = dTmp; unsigned uTmp = NodeIndexes[i]; NodeIndexes[i] = NodeIndexes[i+1]; NodeIndexes[i+1] = uTmp; bDone = false; } } } #if TRACE Log("Internal node index Height\n"); Log("------------------- --------\n"); // 1234567890123456789 123456789 for (unsigned n = 0; n < uInternalNodeCount; ++n) Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]); #endif delete[] Heights; } void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength) { const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = 
tree.GetNeighborCount(uNodeIndex); for (unsigned n = 0; n < uNeighborCount; ++n) { const unsigned uNeighborNodeIndex = tree.GetNeighbor(uNodeIndex, n); if (!tree.HasEdgeLength(uNodeIndex, uNeighborNodeIndex)) continue; if (tree.GetEdgeLength(uNodeIndex, uNeighborNodeIndex) < dMinEdgeLength) tree.SetEdgeLength(uNodeIndex, uNeighborNodeIndex, dMinEdgeLength); } } } muscle-5.1.0/src/tree4.cpp000066400000000000000000000202761424453062600153400ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include #define TRACE 0 void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], unsigned *ptruSubtreeCount) { if (!tree.IsRooted()) Die("ClusterByHeight: requires rooted tree"); #if TRACE Log("ClusterByHeight, max height=%g\n", dMaxHeight); #endif unsigned uSubtreeCount = 0; const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) continue; unsigned uParent = tree.GetParent(uNodeIndex); double dHeight = tree.GetNodeHeight(uNodeIndex); double dParentHeight = tree.GetNodeHeight(uParent); #if TRACE Log("Node %3u Height %5.2f ParentHeight %5.2f\n", uNodeIndex, dHeight, dParentHeight); #endif if (dParentHeight > dMaxHeight && dHeight <= dMaxHeight) { Subtrees[uSubtreeCount] = uNodeIndex; #if TRACE Log("Subtree[%u]=%u\n", uSubtreeCount, uNodeIndex); #endif ++uSubtreeCount; } } *ptruSubtreeCount = uSubtreeCount; } static void ClusterBySubfamCount_Iteration(const Tree &tree, unsigned Subfams[], unsigned uCount) { // Find highest child node of current set of subfamilies. 
double dHighestHeight = -1e20; int iParentSubscript = -1; for (int n = 0; n < (int) uCount; ++n) { const unsigned uNodeIndex = Subfams[n]; if (tree.IsLeaf(uNodeIndex)) continue; const unsigned uLeft = tree.GetLeft(uNodeIndex); const double dHeightLeft = tree.GetNodeHeight(uLeft); if (dHeightLeft > dHighestHeight) { dHighestHeight = dHeightLeft; iParentSubscript = n; } const unsigned uRight = tree.GetRight(uNodeIndex); const double dHeightRight = tree.GetNodeHeight(uRight); if (dHeightRight > dHighestHeight) { dHighestHeight = dHeightRight; iParentSubscript = n; } } if (-1 == iParentSubscript) Die("CBSFCIter: failed to find highest child"); const unsigned uNodeIndex = Subfams[iParentSubscript]; const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); // Delete parent by replacing with left child Subfams[iParentSubscript] = uLeft; // Append right child to list Subfams[uCount] = uRight; #if TRACE { Log("Iter %3u:", uCount); for (unsigned n = 0; n < uCount; ++n) Log(" %u", Subfams[n]); Log("\n"); } #endif } // Divide a tree containing N leaves into k families by // cutting the tree at a horizontal line at some height. // Each internal node defines a height for the cut, // considering all internal nodes enumerates all distinct // cuts. Visit internal nodes in decreasing order of height. // Visiting the node corresponds to moving the horizontal // line down to cut the tree at the height of that node. // We consider the cut to be "infinitestimally below" // the node, so the effect is to remove the current node // from the list of subfamilies and add its two children. // We must visit a parent before its children (so care may // be needed to handle zero edge lengths properly). // We assume that N is small, and write dumb O(N^2) code. // More efficient strategies are possible for large N // by maintaining a list of nodes sorted by height. 
void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, unsigned Subfams[], unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = (uNodeCount + 1)/2; // Special case: empty tree if (0 == uNodeCount) { *ptruSubfamCount = 0; return; } // Special case: more subfamilies than leaves if (uSubfamCount >= uLeafCount) { for (unsigned n = 0; n < uLeafCount; ++n) Subfams[n] = n; *ptruSubfamCount = uLeafCount; return; } // Initialize list of subfamilies to be root Subfams[0] = tree.GetRootNodeIndex(); // Iterate for (unsigned i = 1; i < uSubfamCount; ++i) ClusterBySubfamCount_Iteration(tree, Subfams, i); *ptruSubfamCount = uSubfamCount; } static void GetLeavesRecurse(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned &uLeafCount /* in-out */) { if (tree.IsLeaf(uNodeIndex)) { Leaves[uLeafCount] = uNodeIndex; ++uLeafCount; return; } const unsigned uLeft = tree.GetLeft(uNodeIndex); const unsigned uRight = tree.GetRight(uNodeIndex); GetLeavesRecurse(tree, uLeft, Leaves, uLeafCount); GetLeavesRecurse(tree, uRight, Leaves, uLeafCount); } void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], unsigned *ptruLeafCount) { unsigned uLeafCount = 0; GetLeavesRecurse(tree, uNodeIndex, Leaves, uLeafCount); *ptruLeafCount = uLeafCount; } void Tree::PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount) { if (!tree.IsRooted()) Die("Tree::PruneTree: requires rooted tree"); Clear(); m_uNodeCount = 2*uSubfamCount - 1; InitCache(m_uNodeCount); const unsigned uUnprunedNodeCount = tree.GetNodeCount(); unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount]; unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount]; for (unsigned n = 0; n < uUnprunedNodeCount; ++n) uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR; for (unsigned n = 0; n < m_uNodeCount; ++n) uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR; // Create mapping between unpruned and pruned node indexes unsigned 
uInternalNodeIndex = uSubfamCount; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex]; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex; uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex; for (;;) { uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex); if (tree.IsRoot(uUnprunedNodeIndex)) break; // Already visited this node? if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex]) break; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex; ++uInternalNodeIndex; } } const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex(); uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex; #if TRACE { Log("Pruned to unpruned:\n"); for (unsigned i = 0; i < m_uNodeCount; ++i) Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]); Log("\n"); Log("Unpruned to pruned:\n"); for (unsigned i = 0; i < uUnprunedNodeCount; ++i) { unsigned n = uUnprunedToPrunedIndex[i]; if (n != NULL_NEIGHBOR) Log(" [%u]=%u", i, n); } Log("\n"); } #endif if (uInternalNodeIndex != m_uNodeCount - 1) Die("Tree::PruneTree, Internal error"); // Nodes 0, 1 ... 
are the leaves for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { char szName[32]; sprintf(szName, "Subfam_%u", uSubfamIndex + 1); m_ptrName[uSubfamIndex] = mystrsave(szName); } for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount; ++uPrunedNodeIndex) { unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex]; const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex); const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex); const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft]; const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight]; const double dLeftLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft); const double dRightLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight); m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft; m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight; m_dEdgeLength1[uPrunedLeft] = dLeftLength; m_dEdgeLength1[uPrunedRight] = dRightLength; m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex; m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex; m_bHasEdgeLength1[uPrunedLeft] = true; m_bHasEdgeLength1[uPrunedRight] = true; m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength; m_dEdgeLength3[uPrunedNodeIndex] = dRightLength; m_bHasEdgeLength2[uPrunedNodeIndex] = true; m_bHasEdgeLength3[uPrunedNodeIndex] = true; } m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex]; m_bRooted = true; Validate(); delete[] uUnprunedToPrunedIndex; } void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, unsigned Ids[]) { for (unsigned n = 0; n < uCount; ++n) Ids[n] = tree.GetLeafId(Leaves[n]); } muscle-5.1.0/src/treefromfile.cpp000066400000000000000000000131651424453062600167770ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include "textfile.h" #define TRACE 0 // Tokens in Newick files are: // ( ) : , ; // string // 'string' // "string" // [ comment ] // // We can't safely distinguish between identifiers and 
floating point // numbers at the lexical level (because identifiers may be numeric, // or start with digits), so both edge lengths and identifiers are // returned as strings. const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const { switch (NTT) { #define c(x) case NTT_##x: return #x; c(Unknown) c(Lparen) c(Rparen) c(Colon) c(Comma) c(Semicolon) c(String) c(SingleQuotedString) c(DoubleQuotedString) c(Comment) #undef c } return "??"; } NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const { // Skip leading white space File.SkipWhite(); char c; File.GetCharX(c); // In case a single-character token szToken[0] = c; szToken[1] = 0; unsigned uBytesCopied = 0; NEWICK_TOKEN_TYPE TT; switch (c) { case '(': return NTT_Lparen; case ')': return NTT_Rparen; case ':': return NTT_Colon; case ';': return NTT_Semicolon; case ',': return NTT_Comma; case '\'': TT = NTT_SingleQuotedString; File.GetCharX(c); break; case '"': TT = NTT_DoubleQuotedString; File.GetCharX(c); break; case '[': TT = NTT_Comment; break; default: TT = NTT_String; break; } for (;;) { if (TT != NTT_Comment) { if (uBytesCopied < uBytes - 2) { szToken[uBytesCopied++] = c; szToken[uBytesCopied] = 0; } else Die("Tree::GetToken: input buffer too small, token so far='%s'", szToken); } bool bEof = File.GetChar(c); if (bEof) return TT; switch (TT) { case NTT_String: if (0 != strchr("():;,", c)) { File.PushBack(c); return NTT_String; } if (isspace(c)) return NTT_String; break; case NTT_SingleQuotedString: if ('\'' == c) return NTT_String; break; case NTT_DoubleQuotedString: if ('"' == c) return NTT_String; break; case NTT_Comment: if (']' == c) return GetToken(File, szToken, uBytes); break; default: Die("Tree::GetToken, invalid TT=%u", TT); } } } // NOTE: this hack must come after definition of Tree::GetToken. 
// Parse one Newick "group" from File and attach it at node uNodeIndex.
// A group is either a leaf name or a parenthesized pair "(left,right)";
// either form may be followed by ":length".
//
// For a compound group two child nodes are created with AppendBranch
// (the right child index is the left child index + 1), each sub-group is
// parsed recursively, and any sub-group edge length is stored with
// SetEdgeLength.
//
// Return true if edge length for this group, in which case the value
// (converted with atof) is written to *ptrdEdgeLength. Returns false when no
// length follows, when end of file is reached after the group, or when a
// comma is found where ')' was expected (the comma is pushed back so the
// caller can handle it).
// Dies on any other unexpected token.
bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex,
  double *ptrdEdgeLength)
	{
	char szToken[1024];
	NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken));

	// Group is either leaf name or (left, right).
	if (NTT_String == NTT)
		{
		SetLeafName(uNodeIndex, szToken);
#if TRACE
		Log("Group is leaf '%s'\n", szToken);
#endif
		}
	else if (NTT_Lparen == NTT)
		{
		const unsigned uLeft = AppendBranch(uNodeIndex);
		const unsigned uRight = uLeft + 1;

		// Left sub-group...
#if TRACE
		Log("Got '(', group is compound, expect left sub-group\n");
#endif
		double dEdgeLength;
		bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength);
#if TRACE
		if (bLeftLength)
			Log("Edge length for left sub-group: %.3g\n", dEdgeLength);
		else
			Log("No edge length for left sub-group\n");
#endif
		if (bLeftLength)
			SetEdgeLength(uNodeIndex, uLeft, dEdgeLength);

		// ... then comma ...
#if TRACE
		Log("Expect comma\n");
#endif
		NTT = GetToken(File, szToken, sizeof(szToken));
		if (NTT_Comma != NTT)
			Die("Tree::GetGroupFromFile, expected ',', got '%s'", szToken);

		// ...then right sub-group...
#if TRACE
		Log("Expect right sub-group\n");
#endif
		bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength);
		if (bRightLength)
			SetEdgeLength(uNodeIndex, uRight, dEdgeLength);

#if TRACE
		if (bRightLength)
			Log("Edge length for right sub-group: %.3g\n", dEdgeLength);
		else
			Log("No edge length for right sub-group\n");
#endif

		// ... then closing parenthesis.
#if TRACE
		Log("Expect closing parenthesis (or comma if > 2-ary)\n");
#endif
		NTT = GetToken(File, szToken, sizeof(szToken));
		if (NTT_Rparen == NTT)
			;
		else if (NTT_Comma == NTT)
			{
			// A third sub-group follows: give the comma back and report
			// "no edge length" for this group.
			File.PushBack(',');
			return false;
			}
		else
			Die("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken);
		}
	else
		Die("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'",
		  szToken);

	// Group may optionally be followed by edge length.
	bool bEof = File.SkipWhiteX();
	if (bEof)
		return false;
	char c;
	File.GetCharX(c);
#if TRACE
	Log("Character following group, could be colon, is '%c'\n", c);
#endif
	if (':' == c)
		{
		NTT = GetToken(File, szToken, sizeof(szToken));
		if (NTT_String != NTT)
			Die("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken);
		*ptrdEdgeLength = atof(szToken);
		return true;
		}
	// Not a colon: leave the character for the caller.
	File.PushBack(c);
	return false;
	}
break; } asserta(false); return "?"; } muscle-5.1.0/src/treeperm.h000066400000000000000000000002661424453062600156020ustar00rootroot00000000000000#pragma once enum TREEPERM { TP_None = 0, TP_ABC = 1, TP_ACB = 2, TP_BCA = 3, TP_All = 4 }; TREEPERM StrToTREEPERM(const string &s); const char *TREEPERMToStr(TREEPERM TP); muscle-5.1.0/src/treesplitter.cpp000066400000000000000000000104621424453062600170370ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "treesplitter.h" #include "sort.h" void Splitter::Run(const Tree &T, uint SplitCount) { m_T = &T; const uint NodeCount = m_T->GetNodeCount(); const uint Root = m_T->GetRoot(); m_TargetSize = NodeCount/SplitCount; m_SplitCount = SplitCount; if (m_TargetSize == 0) m_TargetSize = 1; m_SubtreeNodes.push_back(Root); bool TerminatedEarly = false; for (m_SplitIndex = 1; m_SplitIndex < m_SplitCount; ++m_SplitIndex) { asserta(SIZE(m_SubtreeNodes) == m_SplitIndex); uint BiggestNode = GetBiggestNode(); if (m_T->IsLeaf(BiggestNode)) { TerminatedEarly = true; break; } uint Left = m_T->GetLeft(BiggestNode); uint Right = m_T->GetRight(BiggestNode); asserta(Left != UINT_MAX); asserta(Right != UINT_MAX); vector NewSubtreeNodes; for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) { uint Node = m_SubtreeNodes[i]; if (Node == BiggestNode) { uint Left = m_T->GetLeft(BiggestNode); NewSubtreeNodes.push_back(Left); NewSubtreeNodes.push_back(Right); } else NewSubtreeNodes.push_back(Node); } m_SubtreeNodes = NewSubtreeNodes; LogState(); } if (!TerminatedEarly) asserta(SIZE(m_SubtreeNodes) == m_SplitCount); } void Splitter::GetSizeOrder(vector &Order) const { vector Sizes; for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) { uint Node = m_SubtreeNodes[i]; uint Size = m_T->GetSubtreeLeafCount(Node); Sizes.push_back(Size); } uint N = SIZE(Sizes); Order.resize(N); QuickSortOrderDesc(Sizes.data(), N, Order.data()); } void Splitter::LogState() const { Log("\n"); Log("_______________ Split %u ______________\n", m_SplitIndex); Log(" 
Node Size LSize RSize\n"); // 12345 12345 12345 12345\n"); vector Order; GetSizeOrder(Order); uint SumSize = 0; for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) { uint k = Order[i]; uint Node = m_SubtreeNodes[k]; uint Size = m_T->GetSubtreeLeafCount(Node); SumSize += Size; Log("%5u", Node); Log(" %5u", Size); if (!m_T->IsLeaf(Node)) { uint Left = m_T->GetLeft(Node); uint Right = m_T->GetLeft(Node); uint LSize = m_T->GetSubtreeLeafCount(Left); uint RSize = m_T->GetSubtreeLeafCount(Right); Log(" %5u %5u", LSize, RSize); } Log("\n"); } Log("Total %u\n", SumSize); } uint Splitter::GetBiggestNode() const { uint MaxSize = 0; uint MaxNode = UINT_MAX; for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) { uint Node = m_SubtreeNodes[i]; uint Size = m_T->GetSubtreeLeafCount(Node); if (Size > MaxSize) { MaxNode = Node; MaxSize = Size; } } asserta(MaxNode != UINT_MAX); return MaxNode; } void Splitter::GetLabelsVec(vector > &LabelsVec) const { const uint SplitCount = SIZE(m_SubtreeNodes); LabelsVec.clear(); LabelsVec.resize(SplitCount); vector Order; GetSizeOrder(Order); asserta(SIZE(Order) == SplitCount); for (uint i = 0; i < SplitCount; ++i) { uint k = Order[i]; uint Node = m_SubtreeNodes[k]; m_T->GetSubtreeLeafLabels(Node, LabelsVec[i]); } } void Splitter::WriteLabels(const string &FileNamePrefix) const { if (FileNamePrefix.empty()) return; vector Order; GetSizeOrder(Order); string LabelsFileName; for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) { uint k = Order[i]; uint Node = m_SubtreeNodes[k]; vector Labels; m_T->GetSubtreeLeafLabels(Node, Labels); LabelsFileName = opt(prefix); Psa(LabelsFileName, "%u", i+1); FILE *f = CreateStdioFile(LabelsFileName); for (uint j = 0; j < SIZE(Labels); ++j) fprintf(f, "%s\n", Labels[j].c_str()); CloseStdioFile(f); } } void Splitter::GetSubtree(Tree &Subtree, vector &SplitLabels) const { SplitLabels.clear(); asserta(m_T != 0); const uint Size = SIZE(m_SubtreeNodes); for (uint i = 0; i < Size; ++i) { string Label; Ps(Label, "split%u", i); 
SplitLabels.push_back(Label); } MakeSubsetNodes(*m_T, m_SubtreeNodes, SplitLabels, Subtree); } void cmd_split_tree() { const string &TreeFileName = opt(split_tree); uint n = 16; if (optset_n) n = opt(n); asserta(n > 1); Tree T; T.FromFile(TreeFileName); asserta(T.IsRooted()); Splitter S; S.Run(T, n); S.WriteLabels(opt(prefix)); if (optset_output) { Tree Subtree; vector SubLabels; S.GetSubtree(Subtree, SubLabels); Subtree.ToFile(opt(output)); } } muscle-5.1.0/src/treesplitter.h000066400000000000000000000013261424453062600165030ustar00rootroot00000000000000#pragma once #include "tree.h" class Splitter { public: const Tree *m_T = 0; uint m_SplitIndex = 0; uint m_SplitCount = 0; uint m_TargetSize = 0; vector m_SubtreeNodes; public: void Run(const Tree &T, uint SplitCount); uint GetBiggestNode() const; void GetLabelsVec(vector > &LabelsVec) const; void WriteLabels(const string &FileNamePrefix) const; void LogState() const; void GetSizeOrder(vector &Order) const; void GetSubtree(Tree &Subtree, vector &SplitLabels) const; uint GetSplitCount() const { return SIZE(m_SubtreeNodes); } }; void MakeSubsetNodes(const Tree &InputTree, const vector &SubsetNodes, const vector &SubsetLabels, Tree &SubsetTree); muscle-5.1.0/src/treesubsetnodes.cpp000066400000000000000000000122671424453062600175340ustar00rootroot00000000000000#include "myutils.h" #include "muscle.h" #include "tree.h" #include void IntsFromFile(const string &FileName, vector &Ints, vector &Labels) { Ints.clear(); Labels.clear(); FILE *f = OpenStdioFile(FileName); string Line; vector Fields; Progress("Reading %s...", FileName.c_str()); while (ReadLineStdioFile(f, Line)) { Split(Line, Fields, '\t'); asserta(SIZE(Fields) == 2); uint i = StrToUint(Fields[0]); const string &Label = Fields[1]; Ints.push_back(i); Labels.push_back(Label); } Progress("done.\n"); CloseStdioFile(f); } void MakeSubsetNodes(const Tree &InputTree, const vector &SubsetNodes, const vector &SubsetLabels, Tree &SubsetTree) { if (!InputTree.IsRooted()) 
Die("Tree must be rooted"); const uint SubsetNodeCount = SIZE(SubsetNodes); if (SubsetNodeCount < 2) Die("Need at least two nodes"); asserta(SIZE(SubsetLabels) == SubsetNodeCount); const uint NodeCount = InputTree.GetNodeCount(); map OldNodeToNewLabel; set SubsetSet; for (uint i = 0; i < SubsetNodeCount; ++i) { uint SubsetNode = SubsetNodes[i]; SubsetSet.insert(SubsetNode); OldNodeToNewLabel[SubsetNode] = SubsetLabels[i]; } // ParentSet includes subset nodes which are // parents of other subset nodes set ParentSet; vector Path; for (uint i = 0; i < SubsetNodeCount; ++i) { uint SubsetNode = SubsetNodes[i]; InputTree.GetPathToRoot(SubsetNode, Path); const uint n = SIZE(Path); asserta(Path[0] == SubsetNode); for (uint j = 1; j < n; ++j) { uint Node = Path[j]; asserta(Node < NodeCount); if (SubsetSet.find(Node) != SubsetSet.end()) { ProgressLog("Node %u is parent of %u\n", Node, SubsetNode); ParentSet.insert(Node); } } } const uint ParentCount = SIZE(ParentSet); set NewTips; for (set::const_iterator p = SubsetSet.begin(); p != SubsetSet.end(); ++p) { uint Node = *p; if (ParentSet.find(Node) == ParentSet.end()) NewTips.insert(Node); } const uint NewTipCount = SIZE(NewTips); if (NewTipCount == 1) Die("One tip in subset tree"); asserta(NewTipCount + ParentCount == SubsetNodeCount); vector NodeToPathCount(NodeCount); for (set::const_iterator p = NewTips.begin(); p != NewTips.end(); ++p) { uint Tip = *p; InputTree.GetPathToRoot(Tip, Path); const uint n = SIZE(Path); asserta(Path[0] == Tip); for (uint j = 0; j < n; ++j) { uint Node2 = Path[j]; ++NodeToPathCount[Node2]; } } vector OldNodeToNewParent(NodeCount, UINT_MAX); set Pending; set Done; for (set::const_iterator p = NewTips.begin(); p != NewTips.end(); ++p) Pending.insert(*p); for (;;) { if (Pending.empty()) break; uint Node = *Pending.begin(); asserta(Done.find(Node) == Done.end()); Done.insert(Node); Pending.erase(Node); if (InputTree.IsRoot(Node)) continue; InputTree.GetPathToRoot(Node, Path); asserta(Path[0] == 
Node); const uint n = SIZE(Path); uint NewParent = UINT_MAX; for (uint j = 1; j < n; ++j) { uint Node2 = Path[j]; uint Left = InputTree.GetLeft(Node2); uint Right = InputTree.GetRight(Node2); asserta(Left != UINT_MAX); asserta(Right != UINT_MAX); uint LeftPathCount = NodeToPathCount[Left]; uint RightPathCount = NodeToPathCount[Right]; if (LeftPathCount > 0 && RightPathCount > 0) { NewParent = Node2; break; } } asserta(OldNodeToNewParent[Node] == UINT_MAX); OldNodeToNewParent[Node] = NewParent; if (NewParent != UINT_MAX && Done.find(NewParent) == Done.end()) Pending.insert(NewParent); } vector OldNodeToNewNode(NodeCount, UINT_MAX); vector NewNodeToOldNode; for (set::const_iterator p = Done.begin(); p != Done.end(); ++p) { uint OldNode = *p; asserta(OldNode < SIZE(OldNodeToNewNode)); uint NewNode = SIZE(NewNodeToOldNode); OldNodeToNewNode[OldNode] = NewNode; NewNodeToOldNode.push_back(OldNode); } vector NewParents; vector NewLengths; vector NewLabels; for (set::const_iterator p = Done.begin(); p != Done.end(); ++p) { uint OldNode = *p; asserta(OldNode < SIZE(OldNodeToNewNode)); uint NewNode = SIZE(NewNodeToOldNode); asserta(OldNode < SIZE(OldNodeToNewParent)); uint Parent = OldNodeToNewParent[OldNode]; uint NewParent = UINT_MAX; if (Parent != UINT_MAX) NewParent = OldNodeToNewNode[Parent]; string Label; map::const_iterator q = OldNodeToNewLabel.find(OldNode); if (q == OldNodeToNewLabel.end()) InputTree.GetLabel(OldNode, Label); else Label = q->second; float Distance = (Parent == UINT_MAX ? 
0 : (float) InputTree.GetDistance(OldNode, Parent)); NewParents.push_back(NewParent); NewLabels.push_back(Label); NewLengths.push_back(Distance); } SubsetTree.FromVectors(NewLabels, NewParents, NewLengths); } void cmd_tree_subset_nodes() { const string &InputFileName = opt(tree_subset_nodes); const string &NodesFileName = opt(nodes); vector Nodes; vector NewLabels; IntsFromFile(NodesFileName, Nodes, NewLabels); Tree T; T.FromFile(InputFileName); Tree Subtree; MakeSubsetNodes(T, Nodes, NewLabels, Subtree); T.Ladderize(opt(right)); Subtree.ToFile(opt(output)); } muscle-5.1.0/src/treetofile.cpp000066400000000000000000000041651424453062600164560ustar00rootroot00000000000000#include "muscle.h" #include "tree.h" #include "textfile.h" unsigned Tree::GetAnyNonLeafNode() const { for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) if (!IsLeaf(uNodeIndex)) return uNodeIndex; return NULL_NEIGHBOR; } void Tree::ToFile(const string &FileName) const { if (FileName.empty()) return; TextFile TF(FileName.c_str(), true); ToFile(TF); TF.Close(); } void Tree::ToFile(TextFile &File) const { if (IsRooted()) { ToFileNodeRooted(File, m_uRootNodeIndex); File.PutString(";\n"); return; } // Unrooted. 
unsigned uNodeIndex = GetAnyNonLeafNode(); File.PutString("(\n"); ToFileNodeUnrooted(File, m_uNeighbor1[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor2[uNodeIndex], uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, m_uNeighbor3[uNodeIndex], uNodeIndex); File.PutString(");\n"); } void Tree::ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const { assert(!IsRooted()); bool bGroup = !IsLeaf(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeUnrooted(File, GetFirstNeighbor(uNodeIndex, uParent), uNodeIndex); File.PutString(",\n"); ToFileNodeUnrooted(File, GetSecondNeighbor(uNodeIndex, uParent), uNodeIndex); } if (bGroup) File.PutString(")"); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); File.PutString("\n"); } void Tree::ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const { assert(IsRooted()); bool bGroup = !IsLeaf(uNodeIndex) || IsRoot(uNodeIndex); if (bGroup) File.PutString("(\n"); if (IsLeaf(uNodeIndex)) File.PutString(GetName(uNodeIndex)); else { ToFileNodeRooted(File, GetLeft(uNodeIndex)); File.PutString(",\n"); ToFileNodeRooted(File, GetRight(uNodeIndex)); } if (bGroup) File.PutString(")"); if (!IsRoot(uNodeIndex)) { unsigned uParent = GetParent(uNodeIndex); if (HasEdgeLength(uNodeIndex, uParent)) File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); } File.PutString("\n"); } muscle-5.1.0/src/trimtoref.cpp000066400000000000000000000063351424453062600163300ustar00rootroot00000000000000#include "myutils.h" #include "alpha.h" #include "msa.h" void DeleteAllGapColumns(MSA &M) { const uint SeqCount = M.GetSeqCount(); const uint ColCount = M.GetColCount(); vector Keeps; uint KeepCount = 0; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { bool Keep = (M.GetGapCount(ColIndex) < SeqCount); Keeps.push_back(Keep); if (Keep) ++KeepCount; } if (KeepCount 
== 0) Die("MSA is all gaps"); for (uint SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) { char *Seq = M.m_szSeqs[SeqIndex]; uint ToCol = 0; for (uint ColIndex = 0; ColIndex < ColCount; ++ColIndex) { char c = Seq[ColIndex]; if (!Keeps[ColIndex]) { asserta(isgap(c)); continue; } Seq[ToCol++] = c; } asserta(ToCol == KeepCount); } M.m_uColCount = KeepCount; } void TrimToRef(const MSA &Test, const MSA &Ref, MSA &Trimmed) { Trimmed.Clear(); const uint TestSeqCount = Test.GetSeqCount(); const uint TestColCount = Test.GetColCount(); const uint RefSeqCount = Test.GetSeqCount(); const uint RefColCount = Ref.GetColCount(); map RefLabelToSeqIndex; map TestLabelToSeqIndex; vector RefLabels; vector TestLabels; Ref.GetLabelToSeqIndex(RefLabels, RefLabelToSeqIndex); Test.GetLabelToSeqIndex(TestLabels, TestLabelToSeqIndex); asserta(SIZE(RefLabels) == RefSeqCount); asserta(SIZE(TestLabels) == TestSeqCount); vector Labels; vector RefSeqIndexes; for (uint TestSeqIndex = 0; TestSeqIndex < TestSeqCount; ++TestSeqIndex) { const string &Label = TestLabels[TestSeqIndex]; map::const_iterator p = RefLabelToSeqIndex.find(Label); if (p != RefLabelToSeqIndex.end()) { uint RefSeqIndex = p->second; Labels.push_back(Label); RefSeqIndexes.push_back(RefSeqIndex); } } const uint TrimmedSeqCount = SIZE(RefSeqIndexes); Trimmed.SetSize(TrimmedSeqCount, TestColCount); for (uint TrimmedSeqIndex = 0; TrimmedSeqIndex < TrimmedSeqCount; ++TrimmedSeqIndex) { uint RefSeqIndex = RefSeqIndexes[TrimmedSeqIndex]; const string &Label = Labels[TrimmedSeqIndex]; Trimmed.m_szNames[TrimmedSeqIndex] = mystrsave(Label.c_str()); vector PosToUpper; const char *RefRow = Ref.m_szSeqs[RefSeqIndex]; for (uint RefColIndex = 0; RefColIndex < RefColCount; ++RefColIndex) { char c = RefRow[RefColIndex]; if (!isgap(c)) { bool Upper = isupper(c); PosToUpper.push_back(Upper); } } map::const_iterator p = TestLabelToSeqIndex.find(Label); asserta(p != TestLabelToSeqIndex.end()); uint TestSeqIndex = p->second; const char *TestRow = 
Test.m_szSeqs[TestSeqIndex]; char *TrimmedRow = Trimmed.m_szSeqs[TrimmedSeqIndex]; uint Pos = 0; for (uint TestColIndex = 0; TestColIndex < TestColCount; ++TestColIndex) { char c = TestRow[TestColIndex]; if (!isgap(c)) { bool Upper = PosToUpper[Pos++]; if (!Upper) c = '-'; } TrimmedRow[TestColIndex] = c; } asserta(Pos == SIZE(PosToUpper)); } DeleteAllGapColumns(Trimmed); } void cmd_trimtoref() { MSA Test; MSA Ref; Test.FromFASTAFile(opt(trimtoref)); Ref.FromFASTAFile_PreserveCase(opt(ref)); MSA Trimmed; TrimToRef(Test, Ref, Trimmed); Trimmed.ToFASTAFile(opt(output)); } muscle-5.1.0/src/trimtorefefa.cpp000066400000000000000000000013761424453062600170040ustar00rootroot00000000000000#include "myutils.h" #include "alpha.h" #include "msa.h" #include "ensemble.h" void TrimToRef(const MSA &Test, const MSA &Ref, MSA &Trimmed); void cmd_trimtoref_efa() { const string EfaFileName = opt(trimtoref_efa); const string RefFileName = opt(ref); const string OutputFileName = opt(output); Ensemble E; E.FromFile(EfaFileName); MSA Ref; Ref.FromFASTAFile_PreserveCase(RefFileName); FILE *fOut = CreateStdioFile(OutputFileName); const uint MSACount = E.GetMSACount(); for (uint MSAIndex = 0; MSAIndex < MSACount; ++MSAIndex) { Pf(fOut, "<%s\n", E.GetMSAName(MSAIndex).c_str()); const MSA &TestMSA = E.GetMSA(MSAIndex); MSA TrimmedMSA; TrimToRef(TestMSA, Ref, TrimmedMSA); TrimmedMSA.ToFASTAFile(fOut); } CloseStdioFile(fOut); } muscle-5.1.0/src/types.h000066400000000000000000000003401424453062600151140ustar00rootroot00000000000000#pragma once class MSA; class Seq; class TextFile; class Tree; class SeqVect; enum LINKAGE { LINKAGE_Min, LINKAGE_Max, LINKAGE_Avg, LINKAGE_Biased, }; typedef float t_ByteVec[256]; typedef float t_ByteMx[256][256]; muscle-5.1.0/src/uclust.cpp000066400000000000000000000125771424453062600156410ustar00rootroot00000000000000#include "muscle.h" #include "uclust.h" #include "pwpath.h" float UClust::AlignSeqPair(uint SeqIndex1, uint SeqIndex2, string &Path) { Path.clear(); 
const Sequence *Seq1 = m_InputSeqs->GetSequence(SeqIndex1); const Sequence *Seq2 = m_InputSeqs->GetSequence(SeqIndex2); asserta(Seq1 != 0); asserta(Seq2 != 0); float EA = AlignPairFlat(Seq1, Seq2, Path); return EA; } void UClust::AddSeqToIndex(uint SeqIndex) { const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); const byte *ByteSeq = Seq->GetBytePtr(); const uint L = Seq->GetLength(); m_US.AddSeq(ByteSeq, L, SeqIndex); } uint UClust::Search(uint SeqIndex, string &Path) { const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); const byte *ByteSeq = Seq->GetBytePtr(); const uint L = Seq->GetLength(); vector TopSeqIndexes; vector TopWordCounts; m_US.SearchSeq(ByteSeq, L, TopSeqIndexes, TopWordCounts); uint TopCount = SIZE(TopSeqIndexes); asserta(SIZE(TopWordCounts) == TopCount); if (TopCount == 0) return UINT_MAX; if (TopCount > MAX_REJECTS) TopCount = MAX_REJECTS; uint ThreadCount = GetRequestedThreadCount(); uint CentroidSeqIndex = UINT_MAX; for (int TopIndex = 0; TopIndex < (int) TopCount; ++TopIndex) { uint TopSeqIndex = TopSeqIndexes[TopIndex]; float EA = AlignSeqPair(SeqIndex, TopSeqIndex, Path); if (EA >= m_MinEA) { CentroidSeqIndex = TopSeqIndex; break; } } return CentroidSeqIndex; } void UClust::Run(MultiSequence &InputSeqs, float MinEA) { m_InputSeqs = &InputSeqs; m_MinEA = MinEA; m_US.Init(); const uint InputSeqCount = m_InputSeqs->GetSeqCount(); const uint GSICount = GetGlobalMSSeqCount(); vector GSIToInputSeqIndex(GSICount, UINT_MAX); for (uint SeqIndex = 0; SeqIndex < InputSeqCount; ++SeqIndex) { uint GSI = InputSeqs.GetGSI(SeqIndex); asserta(GSI < GSICount); asserta(GSIToInputSeqIndex[GSI] == UINT_MAX); GSIToInputSeqIndex[GSI] = SeqIndex;; } uint CentroidCount = 0; uint MemberCount = 0; m_SeqIndexToCentroidSeqIndex.clear(); m_SeqIndexToPath.clear(); m_SeqIndexToCentroidSeqIndex.resize(InputSeqCount, UINT_MAX); m_SeqIndexToPath.resize(InputSeqCount); #if DEBUG vector Done(InputSeqCount, false); #endif vector Order; InputSeqs.GetLengthOrder(Order); 
uint LastLength = UINT_MAX; const float MinEE = (1 - m_MinEA); for (uint k = 0; k < InputSeqCount; ++k) { uint SeqIndex = Order[k]; asserta(SeqIndex < InputSeqCount); #if DEBUG asserta(Done[SeqIndex] == false); Done[SeqIndex] = true; #endif const uint L = (uint) InputSeqs.GetSeqLength(SeqIndex); asserta(L <= LastLength); LastLength = L; ProgressStep(k, InputSeqCount, "UCLUST %u seqs EE<%.2f, %u centroids, %u members", InputSeqCount, MinEE, CentroidCount, MemberCount); string &Path = m_SeqIndexToPath[SeqIndex]; uint RepSeqIndex = Search(SeqIndex, Path); if (RepSeqIndex == UINT_MAX) { m_CentroidSeqIndexes.push_back(SeqIndex); AddSeqToIndex(SeqIndex); ++CentroidCount; RepSeqIndex = SeqIndex; Path.clear(); } else ++MemberCount; m_SeqIndexToCentroidSeqIndex[SeqIndex] = RepSeqIndex; } } void UClust::GetCentroidSeqs(MultiSequence &CentroidSeqs) const { const uint CentroidCount = SIZE(m_CentroidSeqIndexes); for (uint i = 0; i < CentroidCount; ++i) { uint SeqIndex = m_CentroidSeqIndexes[i]; const Sequence *Seq = m_InputSeqs->GetSequence(SeqIndex); CentroidSeqs.AddSequence(Seq, false); } AssertSameLabels(CentroidSeqs); } void UClust::GetGSIs( vector &CentroidGSIs, vector &MemberGSIs, vector &MemberCentroidGSIs, vector &GSIToMemberCentroidPath) const { CentroidGSIs.clear(); MemberGSIs.clear(); MemberCentroidGSIs.clear(); GSIToMemberCentroidPath.clear(); const uint InputSeqCount = m_InputSeqs->GetSeqCount(); const uint GSICount = GetGSICount(); GSIToMemberCentroidPath.resize(GSICount); const uint ClusterCount = SIZE(m_CentroidSeqIndexes); for (uint ClusterIndex = 0; ClusterIndex < ClusterCount; ++ClusterIndex) { uint CentroidSeqIndex = m_CentroidSeqIndexes[ClusterIndex]; const Sequence *Seq = m_InputSeqs->GetSequence(CentroidSeqIndex); uint CentroidGSI = Seq->GetGSI(); CentroidGSIs.push_back(CentroidGSI); } asserta(SIZE(m_SeqIndexToCentroidSeqIndex) == InputSeqCount); for (uint MemberSeqIndex = 0; MemberSeqIndex < InputSeqCount; ++MemberSeqIndex) { uint CentroidSeqIndex = 
m_SeqIndexToCentroidSeqIndex[MemberSeqIndex]; if (CentroidSeqIndex == MemberSeqIndex) continue; const string &Path = m_SeqIndexToPath[MemberSeqIndex]; const Sequence *MemberSeq = m_InputSeqs->GetSequence(MemberSeqIndex); const Sequence *CentroidSeq = m_InputSeqs->GetSequence(CentroidSeqIndex); uint MemberGSI = MemberSeq->GetGSI(); uint MemberCentroidGSI = CentroidSeq->GetGSI(); MemberGSIs.push_back(MemberGSI); MemberCentroidGSIs.push_back(MemberCentroidGSI); asserta(!Path.empty()); GSIToMemberCentroidPath[MemberGSI] = Path; } } void cmd_uclust() { const string &InputFileName = opt(uclust); const string &OutputFileName = opt(output); const float MinPctId = (float) optd(pctid, 90); MultiSequence InputSeqs; InputSeqs.FromFASTA(InputFileName); bool IsNucleo = InputSeqs.GuessIsNucleo(); if (IsNucleo) SetAlpha(ALPHA_Nucleo); else SetAlpha(ALPHA_Amino); UClust U; U.Run(InputSeqs, MinPctId); MultiSequence *CentroidSeqs = new MultiSequence; U.GetCentroidSeqs(*CentroidSeqs); CentroidSeqs->WriteMFA(OutputFileName); } muscle-5.1.0/src/uclust.h000066400000000000000000000013041424453062600152700ustar00rootroot00000000000000#pragma once #include "usorter.h" class UClust { const uint MAX_REJECTS = 8; public: MultiSequence *m_InputSeqs = 0; float m_MinEA = 0.99f; USorter m_US; vector m_CentroidSeqIndexes; vector m_SeqIndexToCentroidSeqIndex; vector m_SeqIndexToPath; public: void Run(MultiSequence &InputSeqs, float MinEA); uint Search(uint SeqIndex, string &Path); void AddSeqToIndex(uint SeqIndex); float AlignSeqPair(uint SeqIndex1, uint SeqIndex2, string &Path); void GetCentroidSeqs(MultiSequence &CentroidSeqs) const; void GetGSIs( vector &CentroidGSIs, vector &MemberGSIs, vector &MemberCentroidGSIs, vector &GSIToMemberCentroidPath) const; }; muscle-5.1.0/src/upgma5.cpp000066400000000000000000000272471424453062600155200ustar00rootroot00000000000000#include "muscle.h" #include "upgma5.h" #include "textfile.h" #include "tree.h" // UPGMA clustering in O(N^2) time and space. 
#define TRACE 0 static inline float AVG(float x, float y) { return (x + y)/2; } void UPGMA5::LogMe() const { Log("Dist matrix\n"); Log(" "); for (uint i = 0; i < m_LeafCount; ++i) { if (UINT_MAX == m_NodeIndex[i]) continue; Log(" %5u", m_NodeIndex[i]); } Log("\n"); for (uint i = 0; i < m_LeafCount; ++i) { if (UINT_MAX == m_NodeIndex[i]) continue; Log("%5u ", m_NodeIndex[i]); for (uint j = 0; j < m_LeafCount; ++j) { if (UINT_MAX == m_NodeIndex[j]) continue; if (i == j) Log(" "); else { uint v = TriangleSubscript(i, j); Log("%5.2g ", m_Dist[v]); } } Log(" %s", m_Labels[i].c_str()); Log("\n"); } Log("\n"); Log(" i Node NrNb Dist\n"); Log("----- ----- ----- --------\n"); for (uint i = 0; i < m_LeafCount; ++i) { if (UINT_MAX == m_NodeIndex[i]) continue; Log("%5u %5u %5u %8.3f\n", i, m_NodeIndex[i], m_NearestNeighbor[i], m_MinDist[i]); } Log("\n"); Log(" Node L R Height LLength RLength\n"); Log("----- ----- ----- ------ ------- -------\n"); for (uint i = 0; i <= m_InternalNodeIndex; ++i) Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", i, m_Left[i], m_Right[i], m_Height[i], m_LeftLength[i], m_RightLength[i]); } void UPGMA5::Run(LINKAGE Linkage, Tree &tree) { m_LeafCount = SIZE(m_Labels); asserta(SIZE(m_DistMx) == m_LeafCount); for (uint i = 0; i < m_LeafCount; ++i) asserta(SIZE(m_DistMx[i]) == m_LeafCount); m_TriangleSize = (m_LeafCount*(m_LeafCount - 1))/2; m_InternalNodeCount = m_LeafCount - 1; m_Dist = new float[m_TriangleSize]; m_NodeIndex = new uint[m_LeafCount]; m_NearestNeighbor = new uint[m_LeafCount]; m_MinDist = new float[m_LeafCount]; uint *Ids = new uint [m_LeafCount]; char **Names = new char *[m_LeafCount]; m_Left = new uint[m_InternalNodeCount]; m_Right = new uint[m_InternalNodeCount]; m_Height = new float[m_InternalNodeCount]; m_LeftLength = new float[m_InternalNodeCount]; m_RightLength = new float[m_InternalNodeCount]; for (uint i = 0; i < m_LeafCount; ++i) { m_MinDist[i] = FLT_MAX; m_NodeIndex[i] = i; m_NearestNeighbor[i] = UINT_MAX; Ids[i] = i; Names[i] = 
mystrsave(m_Labels[i].c_str()); } for (uint i = 0; i < m_InternalNodeCount; ++i) { m_Left[i] = UINT_MAX; m_Right[i] = UINT_MAX; m_LeftLength[i] = FLT_MAX; m_RightLength[i] = FLT_MAX; m_Height[i] = FLT_MAX; } // Compute initial NxN triangular distance matrix. // Store minimum distance for each full (not triangular) row. // Loop from 1, not 0, because "row" is 0, 1 ... i-1, // so nothing to do when i=0. for (uint i = 1; i < m_LeafCount; ++i) { uint Base = TriangleSubscript(i, 0); float *Row = m_Dist + Base; for (uint j = 0; j < i; ++j) { float d = m_DistMx[i][j]; if (d < 0) { d = 0; m_DistMx[i][j] = 0; m_DistMx[j][i] = 0; } m_Dist[Base++] = d; if (d < m_MinDist[i]) { m_MinDist[i] = d; m_NearestNeighbor[i] = j; } if (d < m_MinDist[j]) { m_MinDist[j] = d; m_NearestNeighbor[j] = i; } } } #if TRACE Log("Initial state:\n"); LogMe(); #endif const uint JoinCount = m_LeafCount - 1; for (m_InternalNodeIndex = 0; m_InternalNodeIndex < JoinCount; ++m_InternalNodeIndex) { ProgressStep(m_InternalNodeIndex, JoinCount, "UPGMA5"); #if TRACE Log("\n"); Log("Internal node index %5u\n", m_InternalNodeIndex); Log("-------------------------\n"); #endif // Find nearest neighbors uint Lmin = UINT_MAX; uint Rmin = UINT_MAX; float dtMinDist = FLT_MAX; for (uint j = 0; j < m_LeafCount; ++j) { if (UINT_MAX == m_NodeIndex[j]) continue; float d = m_MinDist[j]; if (d < dtMinDist) { dtMinDist = d; Lmin = j; Rmin = m_NearestNeighbor[j]; assert(UINT_MAX != Rmin); assert(UINT_MAX != m_NodeIndex[Rmin]); } } assert(Lmin != UINT_MAX); assert(Rmin != UINT_MAX); assert(dtMinDist != FLT_MAX); #if TRACE Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", Lmin, m_NodeIndex[Lmin], Rmin, m_NodeIndex[Rmin], dtMinDist); #endif // Compute distances to new node // New node overwrites row currently assigned to Lmin float dtNewMinDist = FLT_MAX; uint uNewNearestNeighbor = UINT_MAX; for (uint j = 0; j < m_LeafCount; ++j) { if (j == Lmin || j == Rmin) continue; if (UINT_MAX == m_NodeIndex[j]) continue; 
const uint vL = TriangleSubscript(Lmin, j); const uint vR = TriangleSubscript(Rmin, j); const float dL = m_Dist[vL]; const float dR = m_Dist[vR]; float dtNewDist; switch (Linkage) { case LINKAGE_Avg: dtNewDist = AVG(dL, dR); break; case LINKAGE_Min: dtNewDist = min(dL, dR); break; case LINKAGE_Max: dtNewDist = max(dL, dR); break; case LINKAGE_Biased: dtNewDist = 0.1f*AVG(dL, dR) + (1 - 0.1f)*min(dL, dR); break; default: Die("UPGMA5: Invalid LINKAGE_%u", Linkage); } // Nasty special case. // If nearest neighbor of j is Lmin or Rmin, then make the new // node (which overwrites the row currently occupied by Lmin) // the nearest neighbor. This situation can occur when there are // equal distances in the matrix. If we don't make this fix, // the nearest neighbor pointer for j would become invalid. // (We don't need to test for == Lmin, because in that case // the net change needed is zero due to the change in row // numbering). if (m_NearestNeighbor[j] == Rmin) m_NearestNeighbor[j] = Lmin; #if TRACE Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", j, Lmin, dL, Rmin, dR, dtNewDist); #endif m_Dist[vL] = dtNewDist; if (dtNewDist < dtNewMinDist) { dtNewMinDist = dtNewDist; uNewNearestNeighbor = j; } } assert(m_InternalNodeIndex < m_LeafCount - 1 || FLT_MAX != dtNewMinDist); assert(m_InternalNodeIndex < m_LeafCount - 1 || UINT_MAX != uNewNearestNeighbor); const uint v = TriangleSubscript(Lmin, Rmin); const float dLR = m_Dist[v]; const float dHeightNew = dLR/2; const uint uLeft = m_NodeIndex[Lmin]; const uint uRight = m_NodeIndex[Rmin]; const float HeightLeft = uLeft < m_LeafCount ? 0 : m_Height[uLeft - m_LeafCount]; const float HeightRight = uRight < m_LeafCount ? 
0 : m_Height[uRight - m_LeafCount]; m_Left[m_InternalNodeIndex] = uLeft; m_Right[m_InternalNodeIndex] = uRight; m_LeftLength[m_InternalNodeIndex] = dHeightNew - HeightLeft; m_RightLength[m_InternalNodeIndex] = dHeightNew - HeightRight; m_Height[m_InternalNodeIndex] = dHeightNew; // Row for left child overwritten by row for new node m_NodeIndex[Lmin] = m_LeafCount + m_InternalNodeIndex; m_NearestNeighbor[Lmin] = uNewNearestNeighbor; m_MinDist[Lmin] = dtNewMinDist; // Delete row for right child m_NodeIndex[Rmin] = UINT_MAX; #if TRACE Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", m_InternalNodeIndex, Lmin, Rmin); LogMe(); #endif } uint uRoot = m_LeafCount - 2; tree.Create(m_LeafCount, uRoot, m_Left, m_Right, m_LeftLength, m_RightLength, Ids, Names); #if TRACE tree.LogMe(); #endif delete[] m_Dist; delete[] m_NodeIndex; delete[] m_NearestNeighbor; delete[] m_MinDist; delete[] m_Height; delete[] m_Left; delete[] m_Right; delete[] m_LeftLength; delete[] m_RightLength; for (uint i = 0; i < m_LeafCount; ++i) free(Names[i]); delete[] Names; delete[] Ids; m_Dist = 0; m_NodeIndex = 0; m_NearestNeighbor = 0; m_MinDist = 0; m_Height = 0; m_Left = 0; m_LeftLength = 0; m_RightLength = 0; Names = 0; Ids = 0; } void UPGMA5::Clear() { m_Labels.clear(); m_DistMx.clear(); m_LabelToIndex.clear(); } void UPGMA5::Init(const vector &Labels, const vector > &DistMx) { Clear(); m_DistMx = DistMx; m_Labels = Labels; for (uint i = 0; i < SIZE(Labels); ++i) { const string &Label = Labels[i]; if (m_LabelToIndex.find(Label) != m_LabelToIndex.end()) Die("UPGMA5::Init(), duplicate label >%s", Label.c_str()); m_LabelToIndex[Label] = i; } m_LeafCount = SIZE(m_Labels); } void UPGMA5::AddLabel(const string &Label) { asserta(!Label.empty()); if (m_LabelToIndex.find(Label) != m_LabelToIndex.end()) return; uint Index = SIZE(m_Labels); m_Labels.push_back(Label); m_LabelToIndex[Label] = Index; } uint UPGMA5::GetLabelIndex(const string &Label) const { map::const_iterator p = m_LabelToIndex.find(Label); 
asserta(p != m_LabelToIndex.end()); uint Index = p->second; return Index; } void UPGMA5::ReadDistMx(const string &FileName) { Progress("Reading dist mx..."); // Pass 1, labels FILE *f = OpenStdioFile(FileName); m_Labels.clear(); m_LabelToIndex.clear(); string Line; vector Fields; while (ReadLineStdioFile(f, Line)) { Split(Line, Fields, '\t'); asserta(SIZE(Fields) == 3); const string &Label1 = Fields[0]; const string &Label2 = Fields[1]; AddLabel(Label1); AddLabel(Label2); } m_LeafCount = SIZE(m_Labels); m_DistMx.clear(); m_DistMx.resize(m_LeafCount); for (uint i = 0; i < m_LeafCount; ++i) m_DistMx[i].resize(m_LeafCount, FLT_MAX); // Pass 2, distances SetStdioFilePos(f, 0); uint LineNr = 0; while (ReadLineStdioFile(f, Line)) { ++LineNr; Split(Line, Fields, '\t'); asserta(SIZE(Fields) == 3); const string &Label1 = Fields[0]; const string &Label2 = Fields[1]; uint Index1 = GetLabelIndex(Label1); uint Index2 = GetLabelIndex(Label2); float Dist = (float) StrToFloat(Fields[2]); if (Index1 == Index2) Die("Line %u Index1=%u Index2=%u Label1='%s' Label2='%s'", LineNr, Index1, Index2, Label1.c_str(), Label2.c_str()); m_DistMx[Index1][Index2] = Dist; m_DistMx[Index2][Index1] = Dist; } CloseStdioFile(f); Progress(" done.\n"); } void UPGMA5::FixEADistMx() { for (uint i = 0; i < m_LeafCount; ++i) { m_DistMx[i][i] = 0; for (uint j = 0; j < i; ++j) { float d = m_DistMx[i][j]; asserta(d >= 0 && d <= 1); float NewDist = 1 - d; m_DistMx[i][j] = NewDist; m_DistMx[j][i] = NewDist; } } } void UPGMA5::ScaleDistMx() { const float SCALE = 10.0f; float MinDist = m_DistMx[0][1]; float MaxDist = m_DistMx[0][1]; for (uint i = 0; i < m_LeafCount; ++i) { for (uint j = 0; j < i; ++j) { float d = m_DistMx[i][j]; asserta(m_DistMx[j][i] == d); MinDist = min(MinDist, d); MaxDist = max(MaxDist, d); } } ProgressLog("Min dist %.4g, max %.4g\n", MinDist, MaxDist); float MinNewDist = FLT_MAX; float MaxNewDist = FLT_MAX; for (uint i = 0; i < m_LeafCount; ++i) { for (uint j = 0; j < i; ++j) { float d = 
m_DistMx[i][j]; float NewDist = SCALE*(MaxDist - d)/(MaxDist - MinDist); if (MinNewDist == FLT_MAX || NewDist < MinNewDist) MinNewDist = NewDist; if (MaxNewDist == FLT_MAX || NewDist > MaxNewDist) MaxNewDist = NewDist; m_DistMx[i][j] = NewDist; m_DistMx[j][i] = NewDist; } } ProgressLog("Scaled min dist %.3g, max %.3g. scale\n", MinNewDist, MaxNewDist, SCALE); } void cmd_upgma5() { const string &InputFileName = opt(upgma5); const string &OutputFileName = opt(output); UPGMA5 U; U.ReadDistMx(InputFileName); if (opt(scaledist)) U.ScaleDistMx(); else if (opt(eadist)) U.FixEADistMx(); LINKAGE Linkage = LINKAGE_Avg; string sLink = "avg"; if (optset_linkage) { sLink = opt(linkage); if (sLink == "avg") Linkage = LINKAGE_Avg; else if (sLink == "min") Linkage = LINKAGE_Min; else if (sLink == "max") Linkage = LINKAGE_Max; else if (sLink == "biased") Linkage = LINKAGE_Biased; else Die("Invalid -linkage %s", sLink.c_str()); } ProgressLog("UPGMA5(%s)\n", sLink.c_str()); Tree t; U.Run(Linkage, t); TextFile TF(OutputFileName.c_str(), true); t.ToFile(TF); TF.Close(); ProgressLog("All done.\n"); } muscle-5.1.0/src/upgma5.h000066400000000000000000000041051424453062600151510ustar00rootroot00000000000000#pragma once class UPGMA5 { public: uint m_LeafCount = 0; uint m_TriangleSize = 0; uint m_InternalNodeCount = 0; uint m_InternalNodeIndex = 0; // Triangular distance matrix is m_Dist, which is allocated // as a one-dimensional vector of length m_TriangleSize. // TriangleSubscript(i,j) maps row,column=i,j to the subscript // into this vector. // Row / column coordinates are a bit messy. // Initially they are leaf indexes 0..N-1. // But each time we create a new node (=new cluster, new subtree), // we re-use one of the two rows that become available (the children // of the new node). This saves memory. // We keep track of this through the m_NodeIndex vector. float *m_Dist = 0; // Distance to nearest neighbor in row i of distance matrix. // Subscript is distance matrix row. 
float *m_MinDist = 0; // Nearest neighbor to row i of distance matrix. // Subscript is distance matrix row. uint *m_NearestNeighbor = 0; // Node index of row i in distance matrix. // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. // Subscript is distance matrix row. uint *m_NodeIndex = 0; // The following vectors are defined on internal nodes, // subscripts are internal node index 0..N-2. // For m_Left/Right, value is the node index 0 .. 2N-2 // because a child can be internal or leaf. uint *m_Left = 0; uint *m_Right = 0; float *m_Height = 0; float *m_LeftLength = 0; float *m_RightLength = 0; vector m_Labels; vector > m_DistMx; map m_LabelToIndex; public: void Clear(); void Init(const vector &Labels, const vector > &DistMx); void Run(LINKAGE Linkage, Tree &tree); void ReadDistMx(const string &FileName); void ScaleDistMx(); void FixEADistMx(); void LogMe() const; void AddLabel(const string &Label); uint GetLabelIndex(const string &Label) const; uint TriangleSubscript(uint uIndex1, uint uIndex2) const { uint v; if (uIndex1 >= uIndex2) v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; else v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; assert(v < (m_LeafCount*(m_LeafCount - 1))/2); return v; } }; muscle-5.1.0/src/usage.cpp000066400000000000000000000001431424453062600154100ustar00rootroot00000000000000#include "muscle.h" void Usage(FILE *f) { PrintBanner(f); fprintf(f, #include "usage.h" ); } muscle-5.1.0/src/usage.h000066400000000000000000000055461424453062600150710ustar00rootroot00000000000000"Align FASTA input, write aligned FASTA (AFA) output:\n" " muscle -align input.fa -output aln.afa\n" "\n" "Align large input using Super5 algorithm if -align is too expensive,\n" "typically needed with more than a few hundred sequences:\n" " muscle -super5 input.fa -output aln.afa\n" "\n" "Single replicate alignment:\n" " muscle -align input.fa -perm PERM -perturb SEED -output aln.afa\n" " muscle -super5 input.fa -perm PERM -perturb SEED -output aln.afa\n" " PERM is guide tree 
permutation none, abc, acb, bca (default none).\n" " SEED is perturbation seed 0, 1, 2... (default 0 = don't perturb).\n" "\n" "Ensemble of replicate alignments, output in Ensemble FASTA (EFA) format,\n" "EFA has one aligned FASTA for each replicate with header line \" 0); if (g_Alpha == ALPHA_Amino) { m_WordLength = 3; m_DictSize = myipow(20, m_WordLength); } else if (g_Alpha == ALPHA_Nucleo) { m_WordLength = 8; m_DictSize = myipow(4, m_WordLength); } else asserta(false); m_Rows.clear(); m_Rows.resize(m_DictSize); } uint USorter::CharsToWord(const byte *Chars) { if (g_Alpha == ALPHA_Amino) return CharsToWord_Amino(Chars); else if (g_Alpha == ALPHA_Nucleo) return CharsToWord_Nucleo(Chars); asserta(false); return UINT_MAX; } uint USorter::CharsToWord_Amino(const byte *Chars) { uint Word = 0; for (uint i = 0; i < m_WordLength; ++i) { char c = Chars[i]; uint Letter = g_CharToLetter[c]; if (Letter >= 20) return UINT_MAX; Word = Word*20 + Letter; } return Word; } uint USorter::CharsToWord_Nucleo(const byte *Chars) { uint Word = 0; for (uint i = 0; i < m_WordLength; ++i) { char c = Chars[i]; uint Letter = g_CharToLetter[c]; if (Letter >= 4) return UINT_MAX; Word = Word*4 + Letter; } return Word; } void USorter::AddSeq(const byte *Seq, uint L, uint SeqIndex) { asserta(g_AlphaSize > 0); uint Index = SIZE(m_IndexSeqIndexes); asserta(L >= m_WordLength); const uint WordCount = L + 1 - m_WordLength; for (uint i = 0; i < WordCount; ++i) { uint Word = CharsToWord(Seq + i); if (Word < m_DictSize) m_Rows[Word].push_back(Index); } m_IndexSeqIndexes.push_back(SeqIndex); } void USorter::SearchSeq(const byte *Seq, uint L, vector &TopSeqIndexes, vector &TopWordCounts) { TopSeqIndexes.clear(); TopWordCounts.clear(); uint IndexSize = SIZE(m_IndexSeqIndexes); if (IndexSize == 0) return; asserta(L >= m_WordLength); const uint WordCount = L + 1 - m_WordLength; vector U(IndexSize, 0); for (uint i = 0; i < WordCount; ++i) { uint Word = CharsToWord(Seq + i); if (Word >= m_DictSize) continue; 
const vector &Row = m_Rows[Word]; const uint n = SIZE(Row); for (uint j = 0; j < n; ++j) { uint TargetSeqIndex = Row[j]; asserta(TargetSeqIndex < IndexSize); U[TargetSeqIndex] += 1; } } uint *Order = myalloc(uint, IndexSize); QuickSortOrderDesc(U.data(), IndexSize, Order); uint TopSeqIndex = Order[0]; uint TopWordCount = U[TopSeqIndex]; uint MinU = TopWordCount/2 - 1; if (MinU == 0) MinU = 1; uint LastWordCount = TopWordCount; for (uint i = 0; i < IndexSize; ++i) { uint Index = Order[i]; uint WordCount = U[Index]; asserta(WordCount <= LastWordCount); LastWordCount = WordCount; if (WordCount < MinU) break; uint SeqIndex = m_IndexSeqIndexes[Index]; TopSeqIndexes.push_back(SeqIndex); TopWordCounts.push_back(WordCount); } myfree(Order); } void cmd_usorter() { const string QueryFileName = opt(usorter); const string DBFileName = opt(db); MultiSequence Query; Query.FromFASTA(QueryFileName); MultiSequence DB; DB.FromFASTA(DBFileName); SetAlpha(ALPHA_Amino); USorter US; US.Init(); const uint DBSeqCount = DB.GetSeqCount(); for (uint DBSeqIndex = 0; DBSeqIndex < DBSeqCount; ++DBSeqIndex) { const Sequence *seq = DB.GetSequence(DBSeqIndex); const byte *SeqChars = (const byte *) seq->GetBytePtr(); uint L = (uint) seq->GetLength(); US.AddSeq(SeqChars, L, DBSeqIndex); } const uint QuerySeqCount = Query.GetSeqCount(); for (uint QuerySeqIndex = 0; QuerySeqIndex < QuerySeqCount; ++QuerySeqIndex) { const Sequence *seq = Query.GetSequence(QuerySeqIndex); const byte *SeqChars = (const byte *) seq->GetBytePtr(); uint L = (uint) seq->GetLength(); vector TopSeqIndexes; vector TopWordCounts; US.SearchSeq(SeqChars, L, TopSeqIndexes, TopWordCounts); const uint n = SIZE(TopSeqIndexes); asserta(SIZE(TopWordCounts) == n); Log("\n"); Log("Q>%s, %u hits\n", Query.GetLabel(QuerySeqIndex)); for (uint i = 0; i < n; ++i) { uint DBSeqIndex = TopSeqIndexes[i]; uint Count = TopWordCounts[i]; Log(" [%4u] %s\n", Count, DB.GetLabel(DBSeqIndex)); } } } 
muscle-5.1.0/src/usorter.h000066400000000000000000000010001424453062600154450ustar00rootroot00000000000000#pragma once class USorter { public: const MultiSequence m_MFA; vector > m_Rows; vector m_IndexSeqIndexes; uint m_WordLength = 0; // 3; uint m_DictSize = 0; // myipow(20, 3); public: void Init(); void AddSeq(const byte *Seq, uint L, uint SeqIndex); uint CharsToWord(const byte *Chars); uint CharsToWord_Nucleo(const byte *Chars); uint CharsToWord_Amino(const byte *Chars); void SearchSeq(const byte *Seq, uint L, vector &TopSeqIndexes, vector &TopWordCounts); };