xavs2-1.3/ (git commit 0a68a11d2b538a1a61e120ec1911add28a390e55)

xavs2-1.3/.gitattributes
#common settings that generally should always be used with your language specific settings

# Auto detect text files and perform LF normalization
# http://davidlaing.com/2012/09/19/customise-your-gitattributes-to-become-a-git-ninja/
* text=auto

#
# The above will handle all files NOT found below
#

# Scripts
*.bat text eol=crlf
*.cmd text eol=crlf
*.ps1 text eol=crlf
*.sh text eol=lf

# Documents
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.ppt diff=astextplain
*.PPT diff=astextplain
*.pptx diff=astextplain
*.PPTX diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
*.md text
*.adoc text
*.textile text
*.mustache text
*.csv text
*.tab text
*.tsv text
*.sql text

# Graphics
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.tif binary
*.tiff binary
*.ico binary
# SVG treated as an asset (binary) by default. If you want to treat it as text,
# comment-out the following line and uncomment the line after.
*.svg binary
#*.svg text
*.eps binary

#sources
*.c text eol=crlf
*.cc text eol=crlf
*.cxx text eol=crlf
*.cpp text eol=crlf
*.c++ text eol=crlf
*.hpp text eol=crlf
*.h text eol=crlf
*.h++ text eol=crlf
*.hh text eol=crlf
*.asm text eol=crlf
*.S text eol=crlf
*.cfg text eol=crlf
*.txt text eol=lf

# QT Project files
*.pro text eol=lf

# Compiled Object files
*.slo binary
*.lo binary
*.o binary
*.obj binary

# Precompiled Headers
*.gch binary
*.pch binary

# Compiled Dynamic libraries
*.so binary
*.dylib binary
*.dll binary

# Compiled Static libraries
*.lai binary
*.la binary
*.a binary
*.lib binary

# Executables
*.exe binary
*.out binary
*.app binary

# Custom for Visual Studio
*.sln text eol=crlf
*.csproj text eol=crlf
*.vbproj text eol=crlf
*.fsproj text eol=crlf
*.dbproj text eol=crlf
*.vcproj text eol=crlf
*.vcxproj text eol=crlf
*.sln text eol=crlf
*.vcxitems text eol=crlf
*.props text eol=crlf
*.filters text eol=crlf

xavs2-1.3/.github/ISSUE_TEMPLATE/----.md
---
name: 问题咨询
about: 使用问题/安全问题/其他问题
---

请发送邮件至: sswang@pku.edu.cn
或在应用内“高级设置” - “建议反馈” 填写表单

xavs2-1.3/.github/ISSUE_TEMPLATE/bug_report.md
---
name: Bug report
about: Create a bug report / 如果你认为你发现了一项代码问题
---

**Describe the bug**
A clear and concise description of what the bug is.
请详细的描述这个bug的细节

**To Reproduce**
Steps to reproduce the behavior (including the command line parameters)
请详细描述重现这个bug的步骤(运行的命令行参数、输入的文件)

**Expected behavior**
A clear and concise description of what you expected to happen.
你认为这个功能本应如何工作

**Screenshots**
If applicable, add screenshots to help explain your problem.
如果有可能,请提供截图

**Desktop (please complete the following information):**
 - OS: [e.g. Windows10, Ubuntu 18.04]
 - Compiler [e.g. Visual Studio 2013, GCC 5.6.0]
 - yasm [e.g.
1.2.0, 1.3.0-luofl] 你的操作系统(包括版本)、编译器(GCC/G++, VS)、汇编器yasm(版本号)。 **Additional context** Add any other context about the problem here, i.e. video sequences and bitstreams. 额外的材料,例如输入的视频序列、码流文件等。 xavs2-1.3/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000003441340660520300216700ustar00rootroot00000000000000--- name: Feature request about: Suggest an idea for this project / 功能请求 --- 请详细填写以下四项关键元素 ## 功能描述 ## 功能带来的效应 ## 缺少此功能的影响 ## 实现的思路与方式 xavs2-1.3/.gitignore000066400000000000000000000004561340660520300144140ustar00rootroot00000000000000Debug/ Release/ x64_Debug/ x64_Release/ test/ My*/ *.user *.suo *.ncb *.aps *.pdb *.ipdb *.res *.dat *.manifest *.map *.dep *.idb *.ilk *.htm *.exp *.lib *.obj *.iobj *.dll* *.exe *.avs *.mkv *.mp4 *.y4m *.yuv *.log *.bak *.o *.a *.so *.cd *.sdf *.opensdf *.depend version.h *.pc *.mak *.so.* config.h xavs2-1.3/COPYING000066400000000000000000000433111340660520300134540ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. 
To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. 
But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License.  Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program.

You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary.  Here is a sample; alter the names:

  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.

  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice

This General Public License does not permit incorporating your program into proprietary programs.  If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library.  If this is what you want to do, use the GNU Library General Public License instead of this License.

This program is also available under a commercial proprietary license. For more information, contact us at sswang @ pku.edu.cn.

xavs2-1.3/README.md
# xavs2

**xavs2** is an open-source encoder for the `AVS2-P2/IEEE1857.4` video coding standard.

A decoder, **davs2**, can be found at [Github][4] or [Gitee (mirror in China)][5].

[![GitHub tag](https://img.shields.io/github/tag/pkuvcl/xavs2.svg?style=plastic)]()
[![GitHub issues](https://img.shields.io/github/issues/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/issues)
[![GitHub forks](https://img.shields.io/github/forks/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/network)
[![GitHub stars](https://img.shields.io/github/stars/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/stargazers)

## Build it

### Windows

Use `VS2013` or a later version of Visual Studio to open the solution file `./build/vs2013/xavs2.sln`, then set `xavs2` as the startup project and build it.

#### Notes

1. A `shell executor`, i.e. the bash in git-for-windows, is needed and should be found in the `PATH` variable. For example, the path `C:\Program Files\Git\bin` can be added if git-for-windows is installed.
2. `vsyasm` is needed, and version `1.2.0` is suggested for the Windows platform. It can be downloaded from http://yasm.tortall.net/Download.html . A later version, `1.3.0` (an unofficial revision; please read the instructions of `yasm` to build it for your work), can be found at https://github.com/luofalei/yasm/tree/vs2013 . The installation of `vsyasm` is as follows (if you are using `VS2013`):
```
(1) Copy `vsyasm.exe` to the following directory,
      "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\"
(2) Copy the other 3 files in `vsyasm` to the `MSBuild template` directory, as follows,
      "C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V120\BuildCustomizations\"
(3) Re-open the solution.
```

### Linux

```
$ cd build/linux
$ ./configure
$ make
```

## Try it

```
./xavs2 [-f encoder.cfg [-f seq.cfg]] [-p ParameterName=value] [--ParameterName=value]
```

### Encode with configuration files

```
./xavs2 -f encoder.cfg -f seq4K.cfg -p InputFile=input.yuv -p FramesToBeEncoded=500 \
  -p preset=0 -p recon=. -p initial_qp=32 -p OutputFile=test.avs
```

### Encode without configuration files

```
./xavs2 -p InputFile=input.yuv --FramesToBeEncoded=500 --FrameRate=6 \
  --SourceWidth=3840 --SourceHeight=2160 --InputSampleBitDepth=8 --SampleBitDepth=8 \
  --thread_frames=1 --thread_rows=1 --preset=0 \
  --recon=. \
--initial_qp=32 --OutputFile=test.avs ``` ## How to Report Bugs and Provide Feedback Use the ["Issues" tab on Github][6]. ## How to Contribute We welcome community contributions to this project. Thank you for your time! By contributing to the project, you agree to the license and copyright terms therein and to the release of your contribution under these terms. If you have some bugs or features fixed, and would like to share with the public, please [make a Pull Request][7]. ### Contribution process - Validate that your changes do not break a build - Perform smoke tests and ensure they pass - Submit a pull request for review to the maintainer ### Known workitems or bugs - high bit-depth (i.e. 10-bit) support and SIMD optimization. - Rate-control in CBR, VBR. - Adaptive scene change detection and frame type decision. - NEON support for ARM platform. - and so on. ## Homepages [PKU-VCL][1] `AVS2-P2/IEEE1857.4` Encoder: [xavs2 (Github)][2], [xavs2 (mirror in China)][3] `AVS2-P2/IEEE1857.4` Decoder: [davs2 (Github)][4], [davs2 (mirror in China)][5] [1]: http://vcl.idm.pku.edu.cn/ "PKU-VCL" [2]: https://github.com/pkuvcl/xavs2 "xavs2 github repository" [3]: https://gitee.com/pkuvcl/xavs2 "xavs2 gitee repository" [4]: https://github.com/pkuvcl/davs2 "davs2 decoder@github" [5]: https://gitee.com/pkuvcl/davs2 "davs2 decoder@gitee" [6]: https://github.com/pkuvcl/xavs2/issues "report issues" [7]: https://github.com/pkuvcl/xavs2/pulls "pull request" xavs2-1.3/README.zh.md000066400000000000000000000075261340660520300143300ustar00rootroot00000000000000# xavs2 遵循 `AVS2-P2/IEEE1857.4` 视频编码标准的编码器. 对应的解码器 **davs2** 可在 [Github][4] 或 [Gitee (mirror in China)][5] 上找到. [![GitHub tag](https://img.shields.io/github/tag/pkuvcl/xavs2.svg?style=plastic)]() [![GitHub issues](https://img.shields.io/github/issues/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/issues) [![GitHub forks](https://img.shields.io/github/forks/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/network) [![GitHub stars](https://img.shields.io/github/stars/pkuvcl/xavs2.svg)](https://github.com/pkuvcl/xavs2/stargazers) ## 编译方法 ### Windows 可使用`VS2013`打开解决方案`./build/win32/xavs2.sln`进行编译, 也可以使用更新的vs版本打开上述解决方案. 打开解决方案后, 将工程`xavs2`设置为启动项, 进行编译即可. #### 注意 1. 首次编译本项目时, 需要安装一个 `shell 执行器`, 比如 `git-for-windows` 中的 `bash`, 需要将该 `bash` 所在的目录添加到系统环境变量 `PATH` 中. 如上所述, 如果您以默认配置安装了`git-for-windows`, 那么将 `C:\Program Files\Git\bin` 添加到环境变量中即可. 2. 需要安装 `vsyasm`, 我们建议的版本号是 `1.2.0`, 因为官方更新的版本存在编译问题. 下载地址: http://yasm.tortall.net/Download.html . 一个修改过可以正常编译的 `1.3.0` 版本(注意:此修改非官方, 编译请参考yasm的编译指南)可以在这里找到: https://github.com/luofalei/yasm/tree/vs2013 . 其典型的安装步骤如下(使用VS2013时): ``` (1) 将vsyasm.exe文件拷贝到如下目录: "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\" (2) 将剩余三个vsyasm文件拷贝到MSBuild模板目录: "C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V120\BuildCustomizations\" (3) 重新打开VS2013, asmopt工程应已正常加载, 编译无错误. ``` ### Linux 对于linux系统, 依次执行如下命令即可完成编译: ``` $ cd build/linux $ ./configure $ make ``` ## 运行和测试 ``` ./xavs2 [-f encoder.cfg [-f seq.cfg]] [-p ParameterName=value] [--ParameterName=value] ``` ### 使用配置文件进行参数设置 ``` ./xavs2 -f encoder.cfg -f seq4K.cfg -p InputFile=input.yuv -p FramesToBeEncoded=500 \ -p preset=0 -p recon=. -p initial_qp=32 -p OutputFile=test.avs ``` ### 不使用配置文件 ``` ./xavs2 -p InputFile=input.yuv --FramesToBeEncoded=500 --FrameRate=6 \ --SourceWidth=3840 --SourceHeight=2160 --InputSampleBitDepth=8 --SampleBitDepth=8 \ --thread_frames=1 --thread_rows=1 --preset=0 \ --recon=. 
--initial_qp=32 --OutputFile=test.avs ``` ## Issue & Pull Request 欢迎提交 issue,请写清楚遇到问题的环境与运行参数,包括操作系统环境、编译器环境等,重现的流程, 如果可能提供原始输入YUV/码流文件,请尽量提供以方便更快地重现结果。 [反馈问题的 issue 请按照模板格式填写][6]。 如果有开发能力,建议在本地调试出错的代码,并[提供相应修正的 Pull Request][7]。 ### 已知问题与工作清单 - 高比特精度(10-bit)支持与其SIMD指令优化. - 码率控制. - 场景切换检测与自适应帧类型选择. - ARM平台的NEON指令优化. - 等等. ## 主页链接 [北京大学-视频编码算法研究室(PKU-VCL)][1] `AVS2-P2/IEEE1857.4` 编码器: [xavs2 (Github)][2], [xavs2 (mirror in China)][3] `AVS2-P2/IEEE1857.4` 解码器: [davs2 (Github)][4], [davs2 (mirror in China)][5] [1]: http://vcl.idm.pku.edu.cn/ "PKU-VCL" [2]: https://github.com/pkuvcl/xavs2 "xavs2 github repository" [3]: https://gitee.com/pkuvcl/xavs2 "xavs2 gitee repository" [4]: https://github.com/pkuvcl/davs2 "davs2 decoder@github" [5]: https://gitee.com/pkuvcl/davs2 "davs2 decoder@gitee" [6]: https://github.com/pkuvcl/xavs2/issues "report issues" [7]: https://github.com/pkuvcl/xavs2/pulls "pull request" xavs2-1.3/build/000077500000000000000000000000001340660520300135165ustar00rootroot00000000000000xavs2-1.3/build/linux/000077500000000000000000000000001340660520300146555ustar00rootroot00000000000000xavs2-1.3/build/linux/Makefile000066400000000000000000000264061340660520300163250ustar00rootroot00000000000000# Makefile include config.mak vpath %.c $(SRCPATH) vpath %.h $(SRCPATH) vpath %.S $(SRCPATH) vpath %.asm $(SRCPATH) vpath %.rc $(SRCPATH) CFLAGS += -I. -I$(SRCPATH) \ -I$(SRCPATH)/common \ -I$(SRCPATH)/common/x86 \ -I$(SRCPATH)/common/vec \ -I$(SRCPATH)/encoder \ -I$(SRCPATH)/test GENERATED = all: default default: # common sources SRCS = \ common/block_info.c common/common.c \ common/cpu.c common/cudata.c \ common/cg_scan.c \ common/frame.c common/intra.c common/mc.c \ common/pixel.c common/quant.c \ common/threadpool.c common/transform.c \ common/win32thread.c \ common/primitives.c \ common/filter_alf.c \ common/filter_deblock.c \ common/filter_sao.c # encoder sources SRCS += \ encoder/aec_ctx.c encoder/aec.c \ encoder/aec_rdo.c encoder/aec_fastrdo.c encoder/aec_vrdo.c \ encoder/alf.c \ encoder/encoder.c \ encoder/encoder_report.c \ encoder/header.c \ encoder/me.c encoder/ratecontrol.c \ encoder/sao.c encoder/wquant.c \ encoder/md_intra.c \ encoder/md_inter.c \ encoder/presets.c \ encoder/pre_encode.c \ encoder/rdo.c encoder/rdoq.c encoder/rps.c \ encoder/slice.c \ encoder/tdrdo.c \ encoder/yuv_writer.c \ encoder/wrapper.c \ encoder/xavs2.c encoder/xavs2_api.c \ encoder/xlist.c \ encoder/parameters.c SRCCLI = test/test.c SRCSO = OBJS = OBJAVX = OBJSO = OBJCLI = #OBJCHK = tools/checkasm.o CONFIG: $(shell cat config.h) ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),) SRCS += common/threadpool.c endif # MMX/SSE optims ifneq ($(AS),) # asm -------------------------------------------------------------- X86SRC = common/x86/blockcopy8.asm \ common/x86/const-a.asm \ common/x86/cpu-a.asm \ common/x86/dct8.asm \ common/x86/mc-a.asm \ common/x86/mc-a2.asm \ common/x86/pixel-32.asm \ common/x86/pixel-a.asm \ common/x86/pixel-util8.asm \ common/x86/pixeladd8.asm \ common/x86/quant8.asm \ common/x86/sad-a.asm \ common/x86/sad-vpp.asm \ common/x86/satd-a.asm \ common/x86/ssd-a.asm \ common/x86/x86inc.asm \ common/x86/x86util.asm ifeq ($(SYS_ARCH),X86) ARCH_X86 = yes ASMSRC = $(X86SRC) endif ## Until now, we do not have 64-bit asm ifeq ($(SYS_ARCH),X86_64) ARCH_X86 = yes SRCS += common/vec/intrinsic.c \ common/vec/intrinsic_alf.c \ common/vec/intrinsic_sao.c \ common/vec/intrinsic_deblock.c \ common/vec/intrinsic_inter_pred.c \ common/vec/intrinsic_intra-pred.c \ common/vec/intrinsic_intra-filledge.c \ 
common/vec/intrinsic_idct.c \ common/vec/intrinsic_dct.c \ common/vec/intrinsic_quant.c \ common/vec/intrinsic_cg_scan.c \ common/vec/intrinsic_mad.c \ common/vec/intrinsic_pixel.c SRCSAVX = common/vec/intrinsic_dct_avx.c \ common/vec/intrinsic_idct_avx2.c \ common/vec/intrinsic_quant_avx2.c \ common/vec/intrinsic_pixel_avx.c \ common/vec/intrinsic_cg_scan_avx.c \ common/vec/intrinsic_deblock_avx2.c \ common/vec/intrinsic_sao_avx2.c \ common/vec/intrinsic_inter_pred_avx2.c \ common/vec/intrinsic_intra-pred_avx2.c CFLAGS += -mmmx -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -msse4a # ASMSRC = $(X86SRC:-32.asm=-64.asm) ASMSRC = $(X86SRC) ASFLAGS += -DARCH_X86_64=1 OBJASM = $(ASMSRC:%.asm=%.o) $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm endif ifdef ARCH_X86 ASFLAGS += -I$(SRCPATH)/common/x86/ #SRCS += common/x86/mc-c.c common/x86/predict-c.c OBJASM = $(ASMSRC:%.asm=%.o) $(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm endif # AltiVec optims ifeq ($(SYS_ARCH),PPC) SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \ common/ppc/quant.c common/ppc/deblock.c \ common/ppc/predict.c endif # NEON optims ifeq ($(SYS_ARCH),ARM) # x264 ARM asm sources # ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \ # common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \ # common/arm/predict-a.S common/arm/bitstream-a.S # SRCS += common/arm/mc-c.c common/arm/predict-c.c # x265 ARM asm sources ASMSRC += common/arm/blockcopy8.S common/arm/cpu-a.S common/arm/dct-a.S \ common/arm/ipfilter8.S common/arm/mc-a.S common/arm/pixel-util.S \ common/arm/sad-a.S common/arm/ssd-a.S OBJASM = $(ASMSRC:%.S=%.o) endif # AArch64 NEON optims ifeq ($(SYS_ARCH),AARCH64) ASMSRC += common/aarch64/bitstream-a.S \ common/aarch64/cabac-a.S \ common/aarch64/dct-a.S \ common/aarch64/deblock-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S SRCS += common/aarch64/asm-offsets.c \ common/aarch64/mc-c.c \ common/aarch64/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) OBJCHK += tools/checkasm-aarch64.o endif # MSA optims ifeq ($(SYS_ARCH),MIPS) ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),) SRCS += common/mips/mc-c.c common/mips/dct-c.c \ common/mips/deblock-c.c common/mips/pixel-c.c \ common/mips/predict-c.c common/mips/quant-c.c endif endif # asm -------------------------------------------------------------- endif # here ends ifneq ($(AS),) ifneq ($(HAVE_GETOPT_LONG),1) SRCS += compat/getopt/getopt.c endif ## Windows Dll ## ifeq ($(SYS), WINDOWS) ## # OBJCLI += $(if $(RC), xavs2res.o) ## ifneq ($(SONAME),) ## SRCSO += xavs2dll.c ## OBJSO += $(if $(RC), xavs2res.dll.o) ## endif ## endif OBJS += $(SRCS:%.c=%.o) OBJAVX += $(SRCSAVX:%.c=%.o) OBJCLI += $(SRCCLI:%.c=%.o) OBJSO += $(SRCSO:%.c=%.o) .PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags cli: xavs2$(EXE) lib-static: $(LIBXAVS2) lib-shared: $(SONAME) $(LIBXAVS2): $(GENERATED) .depend $(OBJS) $(OBJAVX) $(OBJASM) @echo "\033[33m [linking static] $(LIBXAVS2) \033[0m" rm -f $(LIBXAVS2) $(AR)$@ $(OBJS) $(OBJAVX) $(OBJASM) $(if $(RANLIB), $(RANLIB) $@) $(SONAME): $(GENERATED) .depend $(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO) @echo "\033[33m [linking shared] $(SONAME) \033[0m" $(LD)$@ $(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS) ifneq ($(EXE),) .PHONY: xavs2 checkasm xavs2: xavs2$(EXE) checkasm: checkasm$(EXE) endif xavs2$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBXAVS2) @echo "\033[33m [linking execution] xavs2$(EXE) \033[0m" $(LD)$@ 
$(OBJCLI) $(CLI_LIBXAVS2) $(LDFLAGSCLI) $(LDFLAGS) checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBXAVS2) @echo "\033[33m [linking checkasm] checkasm$(EXE) \033[0m" $(LD)$@ $(OBJCHK) $(LIBXAVS2) $(LDFLAGS) $(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend %.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm @echo "\033[33m [Compiling asm]: $< \033[0m" $(AS) $(ASFLAGS) -o $@ $< -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile %.o: %.S @echo "\033[33m [Compiling asm]: $< \033[0m" $(AS) $(ASFLAGS) -o $@ $< -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile %.dll.o: %.rc xavs2.h @echo "\033[33m [Compiling dll]: $< \033[0m" $(RC) $(RCFLAGS)$@ -DDLL $< %.o: %.rc xavs2.h @echo "\033[33m [Compiling rc]: $< \033[0m" $(RC) $(RCFLAGS)$@ $< $(OBJAVX): @echo "\033[33m [Compiling]: $(@:.o=.c) \033[0m" $(CC) $(CFLAGS) -mavx -mavx2 -c -o $@ $(SRCPATH)/$(@:.o=.c) %.o: %.c @echo "\033[33m [Compiling]: $< \033[0m" $(CC) $(CFLAGS) -c -o $@ $< .depend: config.mak @rm -f .depend @echo "\033[33m dependency file generation... \033[0m" ifeq ($(COMPILER),CL) @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.c=%.o)" 1>> .depend;) @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCSAVX)), $(CC) $(CFLAGS) -mavx2 $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;) else @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;) @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCSAVX)), $(CC) $(CFLAGS) -mavx2 $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;) endif config.mak: ./configure depend: .depend ifneq ($(wildcard .depend),) include .depend endif SRC2 = $(SRCS) $(SRCCLI) # These should cover most of the important codepaths OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50 OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500 OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4 OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2 OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac ifeq (,$(VIDS)) fprofiled: @echo 'usage: make fprofiled VIDS="infile1 infile2 ..."' @echo 'where infiles are anything that xavs2 understands,' @echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.' 
else fprofiled: $(MAKE) clean $(MAKE) xavs2$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)" $(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./xavs2$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;)) ifeq ($(COMPILER),CL) # Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted rm -f xavs2$(EXE) else rm -f $(SRC2:%.c=%.o) endif $(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)" rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc endif clean: rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) rm -f *.a *.lib *.exp *.pdb libxavs2.so* xavs2 xavs2.exe .depend TAGS rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) xavs2_lookahead.clbin rm -f example example.exe $(OBJEXAMPLE) rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean rm -f config.mak xavs2_config.h config.h config.log xavs2.pc xavs2.def conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir) $(INSTALL) xavs2$(EXE) $(DESTDIR)$(bindir) install-lib-dev: $(INSTALL) -d $(DESTDIR)$(includedir) $(INSTALL) -d $(DESTDIR)$(libdir) $(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig $(INSTALL) -m 644 $(SRCPATH)/xavs2.h $(DESTDIR)$(includedir) $(INSTALL) -m 644 xavs2_config.h $(DESTDIR)$(includedir) $(INSTALL) -m 644 xavs2.pc $(DESTDIR)$(libdir)/pkgconfig install-lib-static: lib-static install-lib-dev $(INSTALL) -m 644 $(LIBXAVS2) $(DESTDIR)$(libdir) $(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBXAVS2)) install-lib-shared: lib-shared install-lib-dev ifneq ($(IMPLIBNAME),) $(INSTALL) -d $(DESTDIR)$(bindir) $(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir) $(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir) else ifneq ($(SONAME),) ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libxavs2.$(SOSUFFIX) $(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir) endif uninstall: rm -f $(DESTDIR)$(includedir)/xavs2.h $(DESTDIR)$(includedir)/xavs2_config.h $(DESTDIR)$(libdir)/libxavs2.a rm -f $(DESTDIR)$(bindir)/xavs2$(EXE) $(DESTDIR)$(libdir)/pkgconfig/xavs2.pc ifneq ($(IMPLIBNAME),) rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME) else ifneq ($(SONAME),) rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libxavs2.$(SOSUFFIX) endif etags: TAGS TAGS: etags $(SRCS) xavs2-1.3/build/linux/android_build.sh000066400000000000000000000033751340660520300200200ustar00rootroot00000000000000#!/bin/sh # Instruction: # A simple build script of xavs2/davs2 for Android platform. 
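#
# Usage sketch (inferred from the variables configured below; adjust to your
# environment rather than treating these paths as fixed):
#   1. Edit ARCH (arm / arm64), SDK_VERSION and ANDROID_NDK below to match
#      your NDK install (arm64 needs SDK_VERSION 21 or later, as noted below).
#   2. Run the script from build/linux, where ./configure and the Makefile live:
#        cd build/linux && ./android_build.sh
#   The configure prefix is $(pwd)/android/${ARCH}; the "install" step is left
#   commented out on the final make line, so copy or install the resulting
#   static library yourself if you need it under that prefix.
#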
# Author: # Falei LUO # # reference: http://blog.csdn.net/u010963658/article/details/51404710 # https://github.com/yixia/x264.git # PIE: http://stackoverflow.com/questions/30612067/only-position-independent-executables-pie-are-supported # https://github.com/danielkop/android-ffmpeg/commit/616a099151fb6be05b559adc4c9ed95afacd92c2 # ------------------------------------------------------ # ARCH configurations: (arm/arm64), sdk-verision (19, 21) # only 21 and later version supports arm64 ARCH=arm SDK_VERSION=19 ANDROID_NDK="/android/ndk-r14b" # ------------------------------------------------------ if [ "$ARCH" = "arm64" ] then PLATFORM_PREFIX="aarch64-linux-android-" HOST="aarch64" PLATFORM_VERSION=4.9 EXTRA_CFLAGS="-march=armv8-a -D__ARM_ARCH_7__ -D__ARM_ARCH_7A__ -fPIE -pie" else PLATFORM_PREFIX="arm-linux-androideabi-" HOST="arm" PLATFORM_VERSION=4.9 EXTRA_CFLAGS="-march=armv7-a -mfloat-abi=softfp -mfpu=neon -D__ARM_ARCH_7__ -D__ARM_ARCH_7A__ -fPIE -pie" fi PREFIX=$(pwd)/android/${ARCH} NDKROOT=$ANDROID_NDK/platforms/android-${SDK_VERSION}/arch-${ARCH} TOOLCHAIN=$ANDROID_NDK/toolchains/${PLATFORM_PREFIX}${PLATFORM_VERSION}/prebuilt/linux-x86_64 CROSS_PREFIX=$TOOLCHAIN/bin/${PLATFORM_PREFIX} EXTRA_LDFLAGS="-fPIE -pie" # configure rm -rf config.mak ./configure --prefix=$PREFIX \ --cross-prefix=$CROSS_PREFIX \ --extra-cflags="$EXTRA_CFLAGS" \ --extra-ldflags="$EXTRA_LDFLAGS" \ --enable-pic \ --enable-static \ --enable-strip \ --disable-asm \ --host=arm-linux \ --sysroot=$NDKROOT make clean make STRIP= -j4 # install || exit 1 # scripts ends here xavs2-1.3/build/linux/config.guess000077500000000000000000001277131340660520300172100ustar00rootroot00000000000000#! /bin/sh # Attempt to guess a canonical system name. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # 2011, 2012 Free Software Foundation, Inc. timestamp='2012-09-25' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Originally written by Per Bothner. Please send patches (context # diff format) to and include a ChangeLog # entry. # # This script attempts to guess a canonical system name similar to # config.sub. If it succeeds, it prints the system name on stdout, and # exits with 0. Otherwise, it exits with 1. # # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] Output the configuration name of the system \`$me' is run on. 
Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" >&2 exit 1 ;; * ) break ;; esac done if test $# != 0; then echo "$me: too many arguments$help" >&2 exit 1 fi trap 'exit 1' 1 2 15 # CC_FOR_BUILD -- compiler used by this script. Note that the use of a # compiler to aid in system detection is discouraged as it requires # temporary files to be created and, as you can see below, it is a # headache to deal with in a portable fashion. # Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still # use `HOST_CC' if defined, but it is deprecated. # Portable tmp directory creation inspired by the Autoconf team. set_cc_for_build=' trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; : ${TMPDIR=/tmp} ; { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in ,,) echo "int x;" > $dummy.c ; for c in cc gcc c89 c99 ; do if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; if test x"$CC_FOR_BUILD" = x ; then CC_FOR_BUILD=no_compiler_found ; fi ;; ,,*) CC_FOR_BUILD=$CC ;; ,*,*) CC_FOR_BUILD=$HOST_CC ;; esac ; set_cc_for_build= ;' # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) if (test -f /.attbin/uname) >/dev/null 2>&1 ; then PATH=$PATH:/.attbin ; export PATH fi UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward # compatibility and a consistent mechanism for selecting the # object file format. 
# # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ /usr/sbin/$sysctl 2>/dev/null || echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently, or will in the future. case "${UNAME_MACHINE_ARCH}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? os=netbsd else os=netbsdelf fi ;; *) os=netbsd ;; esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. case "${UNAME_VERSION}" in Debian*) release='-gnu' ;; *) release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. echo "${machine}-${os}${release}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; *:SolidBSD:*:*) echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} exit ;; macppc:MirBSD:*:*) echo powerpc-unknown-mirbsd${UNAME_RELEASE} exit ;; *:MirBSD:*:*) echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on # OSF/1 and Tru64 systems produced since 1995. I hope that # covers most systems running today. This code pipes the CPU # types through head -n 1, so we only detect the type of CPU 0. ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") UNAME_MACHINE="alpha" ;; "EV4.5 (21064)") UNAME_MACHINE="alpha" ;; "LCA4 (21066/21068)") UNAME_MACHINE="alpha" ;; "EV5 (21164)") UNAME_MACHINE="alphaev5" ;; "EV5.6 (21164A)") UNAME_MACHINE="alphaev56" ;; "EV5.6 (21164PC)") UNAME_MACHINE="alphapca56" ;; "EV5.7 (21164PC)") UNAME_MACHINE="alphapca57" ;; "EV6 (21264)") UNAME_MACHINE="alphaev6" ;; "EV6.7 (21264A)") UNAME_MACHINE="alphaev67" ;; "EV6.8CB (21264C)") UNAME_MACHINE="alphaev68" ;; "EV6.8AL (21264B)") UNAME_MACHINE="alphaev68" ;; "EV6.8CX (21264D)") UNAME_MACHINE="alphaev68" ;; "EV6.9A (21264/EV69A)") UNAME_MACHINE="alphaev69" ;; "EV7 (21364)") UNAME_MACHINE="alphaev7" ;; "EV7.9 (21364A)") UNAME_MACHINE="alphaev79" ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. 
echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead # of the specific Alpha model? echo alpha-pc-interix exit ;; 21064:Windows_NT:50:3) echo alpha-dec-winnt3.5 exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition exit ;; *:z/VM:*:*) echo s390-ibm-zvmoe exit ;; *:OS400:*:*) echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) echo hppa1.1-hitachi-hiuxmpp exit ;; Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. if test "`(/bin/universe) 2>/dev/null`" = att ; then echo pyramid-pyramid-sysv3 else echo pyramid-pyramid-bsd fi exit ;; NILE*:*:*:dcosx) echo pyramid-pyramid-svr4 exit ;; DRS?6000:unix:4.0:6*) echo sparc-icl-nx6 exit ;; DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) echo i386-pc-auroraux${UNAME_RELEASE} exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build SUN_ARCH="i386" # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then SUN_ARCH="x86_64" fi fi echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in Series*|S4*) UNAME_RELEASE=`uname -v` ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` exit ;; sun3*:SunOS:*:*) echo m68k-sun-sunos${UNAME_RELEASE} exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} ;; sun4) echo sparc-sun-sunos${UNAME_RELEASE} ;; esac exit ;; aushp:SunOS:*:*) echo sparc-auspex-sunos${UNAME_RELEASE} exit ;; # The situation for MiNT is a little confusing. 
The machine name # can be virtually everything (everything which is not # "atarist" or "atariste" at least should have a processor # > m68000). The system name ranges from "MiNT" over "FreeMiNT" # to the lowercase version "mint" (or "freemint"). Finally # the system name "TOS" denotes a system which is actually not # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) echo m68k-milan-mint${UNAME_RELEASE} exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) echo m68k-hades-mint${UNAME_RELEASE} exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) echo m68k-unknown-mint${UNAME_RELEASE} exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; powerpc:machten:*:*) echo powerpc-apple-machten${UNAME_RELEASE} exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) echo mips-dec-ultrix${UNAME_RELEASE} exit ;; VAX*:ULTRIX*:*:*) echo vax-dec-ultrix${UNAME_RELEASE} exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) echo clipper-intergraph-clix${UNAME_RELEASE} exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { #else int main (argc, argv) int argc; char *argv[]; { #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && SYSTEM_NAME=`$dummy $dummyarg` && { echo "$SYSTEM_NAME"; exit; } echo mips-mips-riscos${UNAME_RELEASE} exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax exit ;; Motorola:*:4.3:PL8-*) echo powerpc-harris-powermax exit ;; Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) echo powerpc-harris-powermax exit ;; Night_Hawk:Power_UNIX:*:*) echo powerpc-harris-powerunix exit ;; m88k:CX/UX:7*:*) echo m88k-harris-cxux7 exit ;; m88k:*:4*:R4*) echo m88k-motorola-sysv4 exit ;; m88k:*:3*:R3*) echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ [ ${TARGET_BINARY_INTERFACE}x = x ] then echo m88k-dg-dgux${UNAME_RELEASE} else echo m88k-dg-dguxbcs${UNAME_RELEASE} fi else echo i586-dg-dgux${UNAME_RELEASE} fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; M88*:*:R3*:*) # Delta 88k system running SVR3 echo m88k-motorola-sysv3 exit ;; XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) echo m88k-tektronix-sysv3 exit ;; Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' i*86:AIX:*:*) echo i386-ibm-aix exit ;; ia64:AIX:*:*) if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include main() { if (!__power_pc()) exit(1); puts("powerpc-ibm-aix3.2.5"); exit(0); } EOF if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` then echo "$SYSTEM_NAME" else echo rs6000-ibm-aix3.2.5 fi elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then echo rs6000-ibm-aix3.2.4 else echo rs6000-ibm-aix3.2 fi exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc fi if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${IBM_ARCH}-ibm-aix${IBM_REV} exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; ibmrt:4.4BSD:*|romp-ibm:BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx exit ;; DPX/2?00:B.O.S.:*:*) echo m68k-bull-sysv3 exit ;; 9000/[34]??:4.3bsd:1.*:*) echo m68k-hp-bsd exit ;; hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in 9000/31? ) HP_ARCH=m68000 ;; 9000/[34]?? ) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in 32) HP_ARCH="hppa2.0n" ;; 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 esac ;; esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #define _HPUX_SOURCE #include #include int main () { #if defined(_SC_KERNEL_BITS) long bits = sysconf(_SC_KERNEL_BITS); #endif long cpu = sysconf (_SC_CPU_VERSION); switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0"); break; case CPU_PA_RISC1_1: puts ("hppa1.1"); break; case CPU_PA_RISC2_0: #if defined(_SC_KERNEL_BITS) switch (bits) { case 64: puts ("hppa2.0w"); break; case 32: puts ("hppa2.0n"); break; default: puts ("hppa2.0"); break; } break; #else /* !defined(_SC_KERNEL_BITS) */ puts ("hppa2.0"); break; #endif default: puts ("hppa1.0"); break; } exit (0); } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac if [ ${HP_ARCH} = "hppa2.0w" ] then eval $set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler # generating 64-bit code. 
GNU and HP use different nomenclature: # # $ CC_FOR_BUILD=cc ./config.guess # => hppa2.0w-hp-hpux11.23 # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then HP_ARCH="hppa2.0w" else HP_ARCH="hppa64" fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} exit ;; ia64:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` echo ia64-hp-hpux${HPUX_REV} exit ;; 3050*:HI-UX:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include int main () { long cpu = sysconf (_SC_CPU_VERSION); /* The order matters, because CPU_IS_HP_MC68K erroneously returns true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct results, however. */ if (CPU_IS_PA_RISC (cpu)) { switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; default: puts ("hppa-hitachi-hiuxwe2"); break; } } else if (CPU_IS_HP_MC68K (cpu)) puts ("m68k-hitachi-hiuxwe2"); else puts ("unknown-hitachi-hiuxwe2"); exit (0); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) echo hppa1.0-hp-bsd exit ;; *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then echo ${UNAME_MACHINE}-unknown-osf1mk else echo ${UNAME_MACHINE}-unknown-osf1 fi exit ;; parisc*:Lites*:*:*) echo hppa1.1-hp-lites exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo 
${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} exit ;; sparc*:BSD/OS:*:*) echo sparc-unknown-bsdi${UNAME_RELEASE} exit ;; *:BSD/OS:*:*) echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` case ${UNAME_PROCESSOR} in amd64) echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; *) echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; esac exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; *:MINGW64*:*) echo ${UNAME_MACHINE}-pc-mingw64 exit ;; *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; i*:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) # uname -m includes "-pc" on this system. echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; *:Interix*:*) case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; authenticamd | genuineintel | EM64T) echo x86_64-unknown-interix${UNAME_RELEASE} exit ;; IA64) echo ia64-unknown-interix${UNAME_RELEASE} exit ;; esac ;; [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; 8664:Windows_NT:*) echo x86_64-pc-mks exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we # UNAME_MACHINE based on the output of uname instead of i386? echo i586-pc-interix exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; p*:CYGWIN*:*) echo powerpcle-unknown-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; *:GNU:*:*) # the GNU system echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; PCA57) UNAME_MACHINE=alphapca56 ;; EV6) UNAME_MACHINE=alphaev6 ;; EV67) UNAME_MACHINE=alphaev67 ;; EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then echo ${UNAME_MACHINE}-unknown-linux-gnu else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then echo ${UNAME_MACHINE}-unknown-linux-gnueabi else echo ${UNAME_MACHINE}-unknown-linux-gnueabihf fi fi exit ;; avr32*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; cris:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; crisv32:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; frv:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; hexagon:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:Linux:*:*) LIBC=gnu eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __dietlibc__ LIBC=dietlibc #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` echo "${UNAME_MACHINE}-pc-linux-${LIBC}" exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m68*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; or32:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; padre:Linux:*:*) echo sparc-unknown-linux-gnu exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) echo hppa64-unknown-linux-gnu exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in PA7*) echo hppa1.1-unknown-linux-gnu ;; PA8*) echo hppa2.0-unknown-linux-gnu ;; *) echo hppa-unknown-linux-gnu ;; esac exit ;; ppc64:Linux:*:*) echo powerpc64-unknown-linux-gnu exit ;; ppc64le:Linux:*:*) echo powerpc64le-unknown-linux-gnu exit ;; ppc:Linux:*:*) echo powerpc-unknown-linux-gnu exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux exit ;; sh64*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; tile*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; vax:Linux:*:*) echo ${UNAME_MACHINE}-dec-linux-gnu exit ;; x86_64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; xtensa*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both # sysname and nodename. echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) # Unixware is an offshoot of SVR4, but it has its own version # number series starting with 2... # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. 
echo ${UNAME_MACHINE}-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) echo ${UNAME_MACHINE}-unknown-stop exit ;; i*86:atheos:*:*) echo ${UNAME_MACHINE}-unknown-atheos exit ;; i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) echo ${UNAME_MACHINE}-pc-msdosdjgpp exit ;; i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} else echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} fi exit ;; i*86:*:5:[678]*) # UnixWare 7.x, OpenUNIX and OpenServer 6. case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` echo ${UNAME_MACHINE}-pc-isc$UNAME_REL elif /bin/uname -X 2>/dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ && UNAME_MACHINE=i586 (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 echo ${UNAME_MACHINE}-pc-sco$UNAME_REL else echo ${UNAME_MACHINE}-pc-sysv32 fi exit ;; pc:*:*:*) # Left here for compatibility: # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub # prints for the "djgpp" host, or else GDB configury will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; paragon:*:*:*) echo i860-intel-osf1 exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered.
echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) # "miniframe" echo m68010-convergent-sysv exit ;; mc68k:UNIX:SYSTEM5:3.51m) echo m68k-convergent-sysv exit ;; M680?0:D-NIX:5.3:*) echo m68k-diab-dnix exit ;; M68*:*:R3V[5678]*:*) test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) OS_REL='' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; NCR*:*:4.2:* | MPRAS*:*:4.2:*) OS_REL='.3' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) echo m68k-unknown-lynxos${UNAME_RELEASE} exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) echo sparc-unknown-lynxos${UNAME_RELEASE} exit ;; rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) echo mips-dde-sysv${UNAME_RELEASE} exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 exit ;; RM*:SINIX-*:*:*) echo mips-sni-sysv4 exit ;; *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` echo ${UNAME_MACHINE}-sni-sysv4 else echo ns32k-sni-sysv fi exit ;; PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says echo i586-unisys-sysv4 exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes . # How about differentiating between stratus architectures? -djm echo hppa1.1-stratus-sysv4 exit ;; *:*:*:FTX*) # From seanf@swdc.stratus.com. echo i860-stratus-sysv4 exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. echo ${UNAME_MACHINE}-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) echo m68k-apple-aux${UNAME_RELEASE} exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then echo mips-nec-sysv${UNAME_RELEASE} else echo mips-unknown-sysv${UNAME_RELEASE} fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. echo powerpc-apple-beos exit ;; BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
echo i586-pc-haiku exit ;; x86_64:Haiku:*:*) echo x86_64-unknown-haiku exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; SX-5:SUPER-UX:*:*) echo sx5-nec-superux${UNAME_RELEASE} exit ;; SX-6:SUPER-UX:*:*) echo sx6-nec-superux${UNAME_RELEASE} exit ;; SX-7:SUPER-UX:*:*) echo sx7-nec-superux${UNAME_RELEASE} exit ;; SX-8:SUPER-UX:*:*) echo sx8-nec-superux${UNAME_RELEASE} exit ;; SX-8R:SUPER-UX:*:*) echo sx8r-nec-superux${UNAME_RELEASE} exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; *:Rhapsody:*:*) echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown case $UNAME_PROCESSOR in i386) eval $set_cc_for_build if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then UNAME_PROCESSOR="x86_64" fi fi ;; unknown) UNAME_PROCESSOR=powerpc ;; esac echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` if test "$UNAME_PROCESSOR" = "x86"; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; NEO-?:NONSTOP_KERNEL:*:*) echo neo-tandem-nsk${UNAME_RELEASE} exit ;; NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-?:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; BS2000:POSIX*:*:*) echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. if test "$cputype" = "386"; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi echo ${UNAME_MACHINE}-unknown-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-unknown-tops10 exit ;; *:TENEX:*:*) echo pdp10-unknown-tenex exit ;; KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) echo pdp10-dec-tops20 exit ;; XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) echo pdp10-xkl-tops20 exit ;; *:TOPS-20:*:*) echo pdp10-unknown-tops20 exit ;; *:ITS:*:*) echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; esac ;; *:XENIX:*:SysV) echo i386-pc-xenix exit ;; i*86:skyos:*:*) echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' exit ;; i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos exit ;; i*86:AROS:*:*) echo ${UNAME_MACHINE}-pc-aros exit ;; x86_64:VMkernel:*:*) echo ${UNAME_MACHINE}-unknown-esx exit ;; esac eval $set_cc_for_build cat >$dummy.c < # include #endif main () { #if defined (sony) #if defined (MIPSEB) /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, I don't know.... 
*/ printf ("mips-sony-bsd\n"); exit (0); #else #include printf ("m68k-sony-newsos%s\n", #ifdef NEWSOS4 "4" #else "" #endif ); exit (0); #endif #endif #if defined (__arm) && defined (__acorn) && defined (__unix) printf ("arm-acorn-riscix\n"); exit (0); #endif #if defined (hp300) && !defined (hpux) printf ("m68k-hp-bsd\n"); exit (0); #endif #if defined (NeXT) #if !defined (__ARCHITECTURE__) #define __ARCHITECTURE__ "m68k" #endif int version; version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; if (version < 4) printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); else printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); exit (0); #endif #if defined (MULTIMAX) || defined (n16) #if defined (UMAXV) printf ("ns32k-encore-sysv\n"); exit (0); #else #if defined (CMU) printf ("ns32k-encore-mach\n"); exit (0); #else printf ("ns32k-encore-bsd\n"); exit (0); #endif #endif #endif #if defined (__386BSD__) printf ("i386-pc-bsd\n"); exit (0); #endif #if defined (sequent) #if defined (i386) printf ("i386-sequent-dynix\n"); exit (0); #endif #if defined (ns32000) printf ("ns32k-sequent-dynix\n"); exit (0); #endif #endif #if defined (_SEQUENT_) struct utsname un; uname(&un); if (strncmp(un.version, "V2", 2) == 0) { printf ("i386-sequent-ptx2\n"); exit (0); } if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ printf ("i386-sequent-ptx1\n"); exit (0); } printf ("i386-sequent-ptx\n"); exit (0); #endif #if defined (vax) # if !defined (ultrix) # include # if defined (BSD) # if BSD == 43 printf ("vax-dec-bsd4.3\n"); exit (0); # else # if BSD == 199006 printf ("vax-dec-bsd4.3reno\n"); exit (0); # else printf ("vax-dec-bsd\n"); exit (0); # endif # endif # else printf ("vax-dec-bsd\n"); exit (0); # endif # else printf ("vax-dec-ultrix\n"); exit (0); # endif #endif #if defined (alliant) && defined (i860) printf ("i860-alliant-bsd\n"); exit (0); #endif exit (1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } # Apollos put the system type in the environment. test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } # Convex versions that predate uname can use getsysinfo(1) if [ -x /usr/convex/getsysinfo ] then case `getsysinfo -f cpu_type` in c1*) echo c1-convex-bsd exit ;; c2*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; c34*) echo c34-convex-bsd exit ;; c38*) echo c38-convex-bsd exit ;; c4*) echo c4-convex-bsd exit ;; esac fi cat >&2 < in order to provide the needed information to handle your system. 
config.guess timestamp = $timestamp uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` /bin/uname -X = `(/bin/uname -X) 2>/dev/null` hostinfo = `(hostinfo) 2>/dev/null` /bin/universe = `(/bin/universe) 2>/dev/null` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` /bin/arch = `(/bin/arch) 2>/dev/null` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` UNAME_MACHINE = ${UNAME_MACHINE} UNAME_RELEASE = ${UNAME_RELEASE} UNAME_SYSTEM = ${UNAME_SYSTEM} UNAME_VERSION = ${UNAME_VERSION} EOF exit 1 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: xavs2-1.3/build/linux/config.sub000077500000000000000000001056561340660520300166550ustar00rootroot00000000000000#! /bin/sh # Configuration validation subroutine script. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # 2011, 2012 Free Software Foundation, Inc. timestamp='2012-12-06' # This file is (in principle) common to ALL GNU software. # The presence of a machine in this file suggests that SOME GNU software # can handle that machine. It does not imply ALL GNU software can. # # This file is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Please send patches to . Submit a context # diff and a properly formatted GNU ChangeLog entry. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. # If it is invalid, we print an error message on stderr and exit with code 1. # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases # that are meaningful with *any* GNU software. # Each package is responsible for reporting which valid configurations # it does not support. The user should be able to distinguish # a failure to support a valid configuration from a meaningless # configuration. 
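# Illustrative examples (added for clarity; not part of the upstream script,
# and inferred only from the alias tables below): "amd64" maps to "x86_64-pc",
# a bare "-linux" is rewritten to "-linux-gnu", and a bare "sun4" picks up
# Sun's default OS, so one would expect, for instance:
#   $ ./config.sub amd64-linux
#   x86_64-pc-linux-gnu
#   $ ./config.sub sun4
#   sparc-sun-sunos4.1.1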
# The goal of this file is to map all the various variations of a given # machine specification into a single specification in the form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM # or in some cases, the newer four-part form: # CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM # It is wrong to echo any other type of specification. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] CPU-MFR-OPSYS $0 [OPTION] ALIAS Canonicalize a configuration name. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.sub ($timestamp) Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" exit 1 ;; *local*) # First pass through any local machine types. echo $1 exit ;; * ) break ;; esac done case $# in 0) echo "$me: missing argument$help" >&2 exit 1;; 1) ;; *) echo "$me: too many arguments$help" >&2 exit 1;; esac # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ knetbsd*-gnu* | netbsd*-gnu* | \ kopensolaris*-gnu* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] then os=`echo $1 | sed 's/.*-/-/'` else os=; fi ;; esac ### Let's recognize common machines as not being operating systems so ### that things like config.sub decstation-3100 work. We also ### recognize some manufacturers as not being operating systems, so we ### can provide default operating systems below. case $os in -sun*os*) # Prevent following clause from handling this invalid input. 
;; -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ -apple | -axis | -knuth | -cray | -microblaze*) os= basic_machine=$1 ;; -bluegene*) os=-cnk ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 ;; -scout) ;; -wrs) os=-vxworks basic_machine=$1 ;; -chorusos*) os=-chorusos basic_machine=$1 ;; -chorusrdb) os=-chorusrdb basic_machine=$1 ;; -hiux*) os=-hiuxwe2 ;; -sco6) os=-sco5v6 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -udk*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 ;; -lynx*5) os=-lynxos5 ;; -lynx*) os=-lynxos ;; -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; -windowsnt*) os=`echo $os | sed -e 's/windowsnt/winnt/'` ;; -psos*) os=-psos ;; -mint | -mint[0-9]*) basic_machine=m68k-atari os=-mint ;; esac # Decode aliases for certain CPU-COMPANY combinations. case $basic_machine in # Recognize the basic CPU types without company name. # Some are omitted here because they have special meanings below. 
1750a | 580 \ | a29k \ | aarch64 | aarch64_be \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ | be32 | be64 \ | bfin \ | c4x | clipper \ | d10v | d30v | dlx | dsp16xx \ | epiphany \ | fido | fr30 | frv \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ | le32 | le64 \ | lm32 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ | mips64octeon | mips64octeonel \ | mips64orion | mips64orionel \ | mips64r5900 | mips64r5900el \ | mips64vr | mips64vrel \ | mips64vr4100 | mips64vr4100el \ | mips64vr4300 | mips64vr4300el \ | mips64vr5000 | mips64vr5000el \ | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ | nios | nios2 \ | ns16k | ns32k \ | open8 \ | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ | rl78 | rx \ | score \ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ | spu \ | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | we32k \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown ;; c54x) basic_machine=tic54x-unknown ;; c55x) basic_machine=tic55x-unknown ;; c6x) basic_machine=tic6x-unknown ;; m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip) basic_machine=$basic_machine-unknown os=-none ;; m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) ;; ms1) basic_machine=mt-unknown ;; strongarm | thumb | xscale) basic_machine=arm-unknown ;; xgate) basic_machine=$basic_machine-unknown os=-none ;; xscaleeb) basic_machine=armeb-unknown ;; xscaleel) basic_machine=armel-unknown ;; # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) basic_machine=$basic_machine-pc ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
580-* \ | a29k-* \ | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ | le32-* | le64-* \ | lm32-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ | microblaze-* | microblazeel-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ | mips64octeon-* | mips64octeonel-* \ | mips64orion-* | mips64orionel-* \ | mips64r5900-* | mips64r5900el-* \ | mips64vr-* | mips64vrel-* \ | mips64vr4100-* | mips64vr4100el-* \ | mips64vr4300-* | mips64vr4300el-* \ | mips64vr5000-* | mips64vr5000el-* \ | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ | nios-* | nios2-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ | pyramid-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ | tron-* \ | ubicom32-* \ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ | ymp-* \ | z8k-* | z80-*) ;; # Recognize the basic CPU types without company name, with glob match. xtensa*) basic_machine=$basic_machine-unknown ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 
386bsd) basic_machine=i386-unknown os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) basic_machine=m68000-att ;; 3b*) basic_machine=we32k-att ;; a29khif) basic_machine=a29k-amd os=-udi ;; abacus) basic_machine=abacus-unknown ;; adobe68k) basic_machine=m68010-adobe os=-scout ;; alliant | fx80) basic_machine=fx80-alliant ;; altos | altos3068) basic_machine=m68k-altos ;; am29k) basic_machine=a29k-none os=-bsd ;; amd64) basic_machine=x86_64-pc ;; amd64-*) basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl os=-sysv ;; amiga | amiga-*) basic_machine=m68k-unknown ;; amigaos | amigados) basic_machine=m68k-unknown os=-amigaos ;; amigaunix | amix) basic_machine=m68k-unknown os=-sysv4 ;; apollo68) basic_machine=m68k-apollo os=-sysv ;; apollo68bsd) basic_machine=m68k-apollo os=-bsd ;; aros) basic_machine=i386-pc os=-aros ;; aux) basic_machine=m68k-apple os=-aux ;; balance) basic_machine=ns32k-sequent os=-dynix ;; blackfin) basic_machine=bfin-unknown os=-linux ;; blackfin-*) basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) basic_machine=powerpc-ibm os=-cnk ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c55x-*) basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c6x-*) basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray os=-unicos ;; cegcc) basic_machine=arm-unknown os=-cegcc ;; convex-c1) basic_machine=c1-convex os=-bsd ;; convex-c2) basic_machine=c2-convex os=-bsd ;; convex-c32) basic_machine=c32-convex os=-bsd ;; convex-c34) basic_machine=c34-convex os=-bsd ;; convex-c38) basic_machine=c38-convex os=-bsd ;; cray | j90) basic_machine=j90-cray os=-unicos ;; craynv) basic_machine=craynv-cray os=-unicosmp ;; cr16 | cr16-*) basic_machine=cr16-unknown os=-elf ;; crds | unos) basic_machine=m68k-crds ;; crisv32 | crisv32-* | etraxfs*) basic_machine=crisv32-axis ;; cris | cris-* | etrax*) basic_machine=cris-axis ;; crx) basic_machine=crx-unknown os=-elf ;; da30 | da30-*) basic_machine=m68k-da30 ;; decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) basic_machine=mips-dec ;; decsystem10* | dec10*) basic_machine=pdp10-dec os=-tops10 ;; decsystem20* | dec20*) basic_machine=pdp10-dec os=-tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) basic_machine=m68k-motorola ;; delta88) basic_machine=m88k-motorola os=-sysv3 ;; dicos) basic_machine=i686-pc os=-dicos ;; djgpp) basic_machine=i586-pc os=-msdosdjgpp ;; dpx20 | dpx20-*) basic_machine=rs6000-bull os=-bosx ;; dpx2* | dpx2*-bull) basic_machine=m68k-bull os=-sysv3 ;; ebmon29k) basic_machine=a29k-amd os=-ebmon ;; elxsi) basic_machine=elxsi-elxsi os=-bsd ;; encore | umax | mmax) basic_machine=ns32k-encore ;; es1800 | OSE68k | ose68k | ose | OSE) basic_machine=m68k-ericsson os=-ose ;; fx2800) basic_machine=i860-alliant ;; genix) basic_machine=ns32k-ns ;; gmicro) basic_machine=tron-gmicro os=-sysv ;; go32) basic_machine=i386-pc os=-go32 ;; h3050r* | hiux*) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; h8300hms) basic_machine=h8300-hitachi os=-hms ;; h8300xray) basic_machine=h8300-hitachi os=-xray ;; h8500hms) basic_machine=h8500-hitachi os=-hms ;; harris) basic_machine=m88k-harris os=-sysv3 ;; hp300-*) basic_machine=m68k-hp ;; hp300bsd) basic_machine=m68k-hp os=-bsd ;; hp300hpux) basic_machine=m68k-hp os=-hpux ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) 
basic_machine=m68000-hp ;; hp9k3[2-9][0-9]) basic_machine=m68k-hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) basic_machine=hppa1.1-hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) basic_machine=hppa1.1-hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; hppa-next) os=-nextstep3 ;; hppaosf) basic_machine=hppa1.1-hp os=-osf ;; hppro) basic_machine=hppa1.1-hp os=-proelf ;; i370-ibm* | ibm*) basic_machine=i370-ibm ;; i*86v32) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; i386-vsta | vsta) basic_machine=i386-unknown os=-vsta ;; iris | iris4d) basic_machine=mips-sgi case $os in -irix*) ;; *) os=-irix4 ;; esac ;; isi68 | isi) basic_machine=m68k-isi os=-sysv ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; m88k-omron*) basic_machine=m88k-omron ;; magnum | m3230) basic_machine=mips-mips os=-sysv ;; merlin) basic_machine=ns32k-utek os=-sysv ;; microblaze*) basic_machine=microblaze-xilinx ;; mingw64) basic_machine=x86_64-pc os=-mingw64 ;; mingw32) basic_machine=i386-pc os=-mingw32 ;; mingw32ce) basic_machine=arm-unknown os=-mingw32ce ;; miniframe) basic_machine=m68000-convergent ;; *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) basic_machine=m68k-atari os=-mint ;; mips3*-*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` ;; mips3*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k os=-coff ;; morphos) basic_machine=powerpc-unknown os=-morphos ;; msdos) basic_machine=i386-pc os=-msdos ;; ms1-*) basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i386-pc os=-msys ;; mvs) basic_machine=i370-ibm os=-mvs ;; nacl) basic_machine=le32-unknown os=-nacl ;; ncr3000) basic_machine=i486-ncr os=-sysv4 ;; netbsd386) basic_machine=i386-unknown os=-netbsd ;; netwinder) basic_machine=armv4l-rebel os=-linux ;; news | news700 | news800 | news900) basic_machine=m68k-sony os=-newsos ;; news1000) basic_machine=m68030-sony os=-newsos ;; news-3600 | risc-news) basic_machine=mips-sony os=-newsos ;; necv70) basic_machine=v70-nec os=-sysv ;; next | m*-next ) basic_machine=m68k-next case $os in -nextstep* ) ;; -ns2*) os=-nextstep2 ;; *) os=-nextstep3 ;; esac ;; nh3000) basic_machine=m68k-harris os=-cxux ;; nh[45]000) basic_machine=m88k-harris os=-cxux ;; nindy960) basic_machine=i960-intel os=-nindy ;; mon960) basic_machine=i960-intel os=-mon960 ;; nonstopux) basic_machine=mips-compaq os=-nonstopux ;; np1) basic_machine=np1-gould ;; neo-tandem) basic_machine=neo-tandem ;; nse-tandem) basic_machine=nse-tandem ;; nsr-tandem) basic_machine=nsr-tandem ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf ;; openrisc | openrisc-*) basic_machine=or32-unknown ;; os400) basic_machine=powerpc-ibm os=-os400 ;; OSE68000 | ose68000) basic_machine=m68000-ericsson os=-ose ;; os68k) basic_machine=m68k-none os=-os68k ;; pa-hitachi) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; paragon) 
basic_machine=i860-intel os=-osf ;; parisc) basic_machine=hppa-unknown os=-linux ;; parisc-*) basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; pbd) basic_machine=sparc-tti ;; pbb) basic_machine=m68k-tti ;; pc532 | pc532-*) basic_machine=ns32k-pc532 ;; pc98) basic_machine=i386-pc ;; pc98-*) basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc ;; pentiumpro | p6 | 6x86 | athlon | athlon_*) basic_machine=i686-pc ;; pentiumii | pentium2 | pentiumiii | pentium3) basic_machine=i686-pc ;; pentium4) basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium4-*) basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould ;; power) basic_machine=power-ibm ;; ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppcle | powerpclittle | ppc-le | powerpc-little) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64le | powerpc64little | ppc64-le | powerpc64-little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm ;; pw32) basic_machine=i586-unknown os=-pw32 ;; rdos | rdos64) basic_machine=x86_64-pc os=-rdos ;; rdos32) basic_machine=i386-pc os=-rdos ;; rom68k) basic_machine=m68k-rom68k os=-coff ;; rm[46]00) basic_machine=mips-siemens ;; rtpc | rtpc-*) basic_machine=romp-ibm ;; s390 | s390-*) basic_machine=s390-ibm ;; s390x | s390x-*) basic_machine=s390x-ibm ;; sa29200) basic_machine=a29k-amd os=-udi ;; sb1) basic_machine=mipsisa64sb1-unknown ;; sb1el) basic_machine=mipsisa64sb1el-unknown ;; sde) basic_machine=mipsisa32-sde os=-elf ;; sei) basic_machine=mips-sei os=-seiux ;; sequent) basic_machine=i386-sequent ;; sh) basic_machine=sh-hitachi os=-hms ;; sh5el) basic_machine=sh5le-unknown ;; sh64) basic_machine=sh64-unknown ;; sparclite-wrs | simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; sps7) basic_machine=m68k-bull os=-sysv2 ;; spur) basic_machine=spur-unknown ;; st2000) basic_machine=m68k-tandem ;; stratus) basic_machine=i860-stratus os=-sysv4 ;; strongarm-* | thumb-*) basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun ;; sun2os3) basic_machine=m68000-sun os=-sunos3 ;; sun2os4) basic_machine=m68000-sun os=-sunos4 ;; sun3os3) basic_machine=m68k-sun os=-sunos3 ;; sun3os4) basic_machine=m68k-sun os=-sunos4 ;; sun4os3) basic_machine=sparc-sun os=-sunos3 ;; sun4os4) basic_machine=sparc-sun os=-sunos4 ;; sun4sol2) basic_machine=sparc-sun os=-solaris2 ;; sun3 | sun3-*) basic_machine=m68k-sun ;; sun4) basic_machine=sparc-sun ;; sun386 | sun386i | roadrunner) basic_machine=i386-sun ;; sv1) basic_machine=sv1-cray os=-unicos ;; symmetry) basic_machine=i386-sequent os=-dynix ;; t3e) basic_machine=alphaev5-cray os=-unicos ;; t90) basic_machine=t90-cray os=-unicos ;; tile*) basic_machine=$basic_machine-unknown os=-linux-gnu ;; tx39) 
basic_machine=mipstx39-unknown ;; tx39el) basic_machine=mipstx39el-unknown ;; toad1) basic_machine=pdp10-xkl os=-tops20 ;; tower | tower-32) basic_machine=m68k-ncr ;; tpf) basic_machine=s390x-ibm os=-tpf ;; udi29k) basic_machine=a29k-amd os=-udi ;; ultra3) basic_machine=a29k-nyu os=-sym1 ;; v810 | necv810) basic_machine=v810-nec os=-none ;; vaxv) basic_machine=vax-dec os=-sysv ;; vms) basic_machine=vax-dec os=-vms ;; vpp*|vx|vx-*) basic_machine=f301-fujitsu ;; vxworks960) basic_machine=i960-wrs os=-vxworks ;; vxworks68) basic_machine=m68k-wrs os=-vxworks ;; vxworks29k) basic_machine=a29k-wrs os=-vxworks ;; w65*) basic_machine=w65-wdc os=-none ;; w89k-*) basic_machine=hppa1.1-winbond os=-proelf ;; xbox) basic_machine=i686-pc os=-mingw32 ;; xps | xps100) basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; z8k-*-coff) basic_machine=z8k-unknown os=-sim ;; z80-*-coff) basic_machine=z80-unknown os=-sim ;; none) basic_machine=none-none os=-none ;; # Here we handle the default manufacturer of certain CPU types. It is in # some cases the only manufacturer, in others, it is the most popular. w89k) basic_machine=hppa1.1-winbond ;; op50n) basic_machine=hppa1.1-oki ;; op60c) basic_machine=hppa1.1-oki ;; romp) basic_machine=romp-ibm ;; mmix) basic_machine=mmix-knuth ;; rs6000) basic_machine=rs6000-ibm ;; vax) basic_machine=vax-dec ;; pdp10) # there are many clones, so DEC is not a safe bet basic_machine=pdp10-unknown ;; pdp11) basic_machine=pdp11-dec ;; we32k) basic_machine=we32k-att ;; sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) basic_machine=sparc-sun ;; cydra) basic_machine=cydra-cydrome ;; orion) basic_machine=orion-highlevel ;; orion105) basic_machine=clipper-highlevel ;; mac | mpw | mac-mpw) basic_machine=m68k-apple ;; pmac | pmac-mpw) basic_machine=powerpc-apple ;; *-unknown) # Make sure to match an already-canonicalized machine name. ;; *) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; esac # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` ;; *-commodore*) basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` ;; *) ;; esac # Decode manufacturer-specific aliases for certain operating systems. if [ x"$os" != x"" ] then case $os in # First match some system type aliases # that might get confused with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux ;; -solaris1 | -solaris1.*) os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; -solaris) os=-solaris2 ;; -svr4*) os=-sysv4 ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; # First accept the basic system types. # The portable systems comes first. # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ | -bitrig* | -openbsd* | -solidbsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ | -chorusos* | -chorusrdb* | -cegcc* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) case $basic_machine in x86-* | i*86-*) ;; *) os=-nto$os ;; esac ;; -nto-qnx*) ;; -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc ;; -linux*) os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) os=`echo $os | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) os=`echo $os | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition ;; -os400*) os=-os400 ;; -wince*) os=-wince ;; -osfrose*) os=-osfrose ;; -osf*) os=-osf ;; -utek*) os=-bsd ;; -dynix*) os=-bsd ;; -acis*) os=-aos ;; -atheos*) os=-atheos ;; -syllable*) os=-syllable ;; -386bsd) os=-bsd ;; -ctix* | -uts*) os=-sysv ;; -nova*) os=-rtmk-nova ;; -ns2 ) os=-nextstep2 ;; -nsk*) os=-nsk ;; # Preserve the version number of sinix5. -sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; -sinix*) os=-sysv4 ;; -tpf*) os=-tpf ;; -triton*) os=-sysv3 ;; -oss*) os=-sysv3 ;; -svr4) os=-sysv4 ;; -svr3) os=-sysv3 ;; -sysvr4) os=-sysv4 ;; # This must come after -sysvr4. -sysv*) ;; -ose*) os=-ose ;; -es1800*) os=-ose ;; -xenix) os=-xenix ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; -aros*) os=-aros ;; -kaos*) os=-kaos ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; -nacl*) ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. os=`echo $os | sed 's/[^-]*-//'` echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 exit 1 ;; esac else # Here we handle the default operating systems that come with various machines. # The value should be what the vendor currently ships out the door with their # machine or put another way, the most popular os provided with the machine. # Note that if you're going to try to match "-MANUFACTURER" here (say, # "-sun"), then you have to tell the case statement up towards the top # that MANUFACTURER isn't an operating system. 
Otherwise, code above # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. case $basic_machine in score-*) os=-elf ;; spu-*) os=-elf ;; *-acorn) os=-riscix1.2 ;; arm*-rebel) os=-linux ;; arm*-semi) os=-aout ;; c4x-* | tic4x-*) os=-coff ;; hexagon-*) os=-elf ;; tic54x-*) os=-coff ;; tic55x-*) os=-coff ;; tic6x-*) os=-coff ;; # This must come before the *-dec entry. pdp10-*) os=-tops20 ;; pdp11-*) os=-none ;; *-dec | vax-*) os=-ultrix4.2 ;; m68*-apollo) os=-domain ;; i386-sun) os=-sunos4.0.2 ;; m68000-sun) os=-sunos3 ;; m68*-cisco) os=-aout ;; mep-*) os=-elf ;; mips*-cisco) os=-elf ;; mips*-*) os=-elf ;; or32-*) os=-coff ;; *-tti) # must be before sparc entry or we get the wrong os. os=-sysv3 ;; sparc-* | *-sun) os=-sunos4.1.1 ;; *-be) os=-beos ;; *-haiku) os=-haiku ;; *-ibm) os=-aix ;; *-knuth) os=-mmixware ;; *-wec) os=-proelf ;; *-winbond) os=-proelf ;; *-oki) os=-proelf ;; *-hp) os=-hpux ;; *-hitachi) os=-hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) os=-sysv ;; *-cbm) os=-amigaos ;; *-dg) os=-dgux ;; *-dolphin) os=-sysv3 ;; m68k-ccur) os=-rtu ;; m88k-omron*) os=-luna ;; *-next ) os=-nextstep ;; *-sequent) os=-ptx ;; *-crds) os=-unos ;; *-ns) os=-genix ;; i370-*) os=-mvs ;; *-next) os=-nextstep3 ;; *-gould) os=-sysv ;; *-highlevel) os=-bsd ;; *-encore) os=-bsd ;; *-sgi) os=-irix ;; *-siemens) os=-sysv4 ;; *-masscomp) os=-rtu ;; f30[01]-fujitsu | f700-fujitsu) os=-uxpv ;; *-rom68k) os=-coff ;; *-*bug) os=-coff ;; *-apple) os=-macos ;; *-atari*) os=-mint ;; *) os=-none ;; esac fi # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. vendor=unknown case $basic_machine in *-unknown) case $os in -riscix*) vendor=acorn ;; -sunos*) vendor=sun ;; -cnk*|-aix*) vendor=ibm ;; -beos*) vendor=be ;; -hpux*) vendor=hp ;; -mpeix*) vendor=hp ;; -hiux*) vendor=hitachi ;; -unos*) vendor=crds ;; -dgux*) vendor=dg ;; -luna*) vendor=omron ;; -genix*) vendor=ns ;; -mvs* | -opened*) vendor=ibm ;; -os400*) vendor=ibm ;; -ptx*) vendor=sequent ;; -tpf*) vendor=ibm ;; -vxsim* | -vxworks* | -windiss*) vendor=wrs ;; -aux*) vendor=apple ;; -hms*) vendor=hitachi ;; -mpw* | -macos*) vendor=apple ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) vendor=atari ;; -vos*) vendor=stratus ;; esac basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` ;; esac echo $basic_machine$os exit # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: xavs2-1.3/build/linux/configure000077500000000000000000001377521340660520300166030ustar00rootroot00000000000000#!/bin/bash if test x"$1" = x"-h" -o x"$1" = x"--help" ; then cat <> config.log } log_ok() { echo "yes" >> config.log } log_fail() { echo "no" >> config.log } log_msg() { echo "$1" >> config.log } cc_cflags() { # several non gcc compilers issue an incredibly large number of warnings on high warning levels, # suppress them by reducing the warning level rather than having to use #pragmas for arg in $*; do [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wno-maybe-uninitialized ] && arg= [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= [[ "$arg" = -l* ]] && arg= [[ "$arg" = -L* ]] && arg= if [ $compiler_style = MS ]; then [ "$arg" = -ffast-math ] && arg="-fp:fast" [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-W3 -WX" [ "$arg" = -g 
] && arg=-Z7 [ "$arg" = -fomit-frame-pointer ] && arg= [ "$arg" = -s ] && arg= [ "$arg" = -fPIC ] && arg= else [ "$arg" = -ffast-math ] && arg= [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-w3 -Werror" fi [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2 [ -n "$arg" ] && echo -n "$arg " done } cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wmaybe-uninitialized ] && arg= [[ "$arg" = -Qdiag-error* ]] && arg= arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib [ "$arg" = xavs2.lib ] && arg=libxavs2.lib [ -n "$arg" ] && echo -n "$arg " done } cc_check() { if [ -z "$3" ]; then if [ -z "$1$2" ]; then log_check "whether $CC works" elif [ -z "$1" ]; then log_check "for $2" else log_check "for $1" fi elif [ -z "$1" ]; then if [ -z "$2" ]; then log_check "whether $CC supports $3" else log_check "whether $CC supports $3 with $2" fi else log_check "for $3 in $1"; fi rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi if $cc_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$cc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } cpp_check() { log_check "whether $3 is true" rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" else cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest" fi if $cpp_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "--------------------------------------------------" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } as_check() { log_check "whether $AS supports $1" echo "$1" > conftest$AS_EXT as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o" if $as_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? 
log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$as_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest$AS_EXT >> config.log log_msg "--------------------------------------------------" fi return $res } rc_check() { log_check "whether $RC works" echo "$1" > conftest.rc if [ $compiler = GNU ]; then rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc" else rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc" fi if $rc_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$rc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.rc >> config.log log_msg "--------------------------------------------------" fi return $res } define() { echo "#define $1$([ -n "$2" ] && echo " $2" || echo " 1")" >> config.h } die() { log_msg "DIED: $@" echo "$@" exit 1 } configure_system_override() { log_check "system libxavs2 configuration" xavs2_config_path="$1/xavs2_config.h" if [ -e "$xavs2_config_path" ]; then res=$? log_ok arg="$(grep '#define XAVS2_GPL ' $xavs2_config_path | sed -e 's/#define XAVS2_GPL *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" [ "$arg" != "$gpl" ] && die "Incompatible license with system libxavs2" fi arg="$(grep '#define XAVS2_BIT_DEPTH ' $xavs2_config_path | sed -e 's/#define XAVS2_BIT_DEPTH *//; s/ *$//')" if [ -n "$arg" ]; then if [ "$arg" != "$bit_depth" ]; then echo "Override output bit depth with system libxavs2 configuration" bit_depth="$arg" fi fi arg="$(grep '#define XAVS2_CHROMA_FORMAT ' $xavs2_config_path | sed -e 's/#define XAVS2_CHROMA_FORMAT *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="all" || arg="${arg#XAVS2_CSP_I}" if [ "$arg" != "$chroma_format" ]; then echo "Override output chroma format with system libxavs2 configuration" chroma_format="$arg" fi fi arg="$(grep '#define XAVS2_INTERLACED ' $xavs2_config_path | sed -e 's/#define XAVS2_INTERLACED *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" if [ "$arg" != "$interlaced" ]; then echo "Override interlaced encoding support with system libxavs2 configuration" interlaced="$arg" fi fi else res=$? log_fail log_msg "Failed search path was: $xavs2_config_path" fi return $res } rm -f xavs2_config.h config.h config.mak config.log xavs2.pc xavs2.def conftest* # Construct a path to the specified directory relative to the working directory relative_path() { local base="${PWD%/}" local path="$(cd "$1" >/dev/null; printf '%s/.' "${PWD%/}")" local up='' while [[ $path != "$base/"* ]]; do base="${base%/*}" up="../$up" done dirname "$up${path#"$base/"}" } SRCPATH="$(cd ../../source ; pwd)" BUILDPATH="$(cd . ; pwd)" echo "$SRCPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path." echo "$BUILDPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path." [ -e "$BUILDPATH/config.h" -o -e "$BUILDPATH/xavs2_config.h" ] && die "Out of tree builds are impossible with config.h/xavs2_config.h in source dir." 
prefix='/usr/local' exec_prefix='${prefix}' bindir='${exec_prefix}/bin' libdir='${exec_prefix}/lib' includedir='${prefix}/include' DEVNULL='/dev/null' cli="yes" cli_libxavs2="internal" shared="no" static="yes" avs="no" lavf="no" ffms="no" gpac="auto" lsmash="auto" mp4="no" gpl="yes" thread="auto" swscale="no" asm="auto" interlaced="yes" lto="no" debug="no" gprof="no" strip="no" pic="no" bit_depth="8" chroma_format="all" compiler="GNU" compiler_style="GNU" opencl="no" vsx="auto" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" LDFLAGSCLI="$LDFLAGSCLI" ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)" RCFLAGS="$RCFLAGS" CHECK_CFLAGS="" HAVE_GETOPT_LONG=1 cross_prefix="" EXE="" AS_EXT=".S" NL=" " # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ MSA MMAP WINRT VSX" # parse options for opt do optarg="${opt#*=}" case "$opt" in --prefix=*) prefix="$optarg" ;; --exec-prefix=*) exec_prefix="$optarg" ;; --bindir=*) bindir="$optarg" ;; --libdir=*) libdir="$optarg" ;; --includedir=*) includedir="$optarg" ;; --disable-cli) cli="no" ;; --system-libxavs2) cli_libxavs2="system" ;; --enable-shared) shared="yes" ;; --enable-static) static="yes" ;; --disable-asm) asm="no" ;; --disable-interlaced) interlaced="no" ;; --disable-avs) avs="no" ;; --disable-lavf) lavf="no" ;; --disable-ffms) ffms="no" ;; --disable-gpac) gpac="no" ;; --disable-lsmash) lsmash="no" ;; --disable-gpl) gpl="no" ;; --extra-asflags=*) ASFLAGS="$ASFLAGS $optarg" ;; --extra-cflags=*) CFLAGS="$CFLAGS $optarg" ;; --extra-ldflags=*) LDFLAGS="$LDFLAGS $optarg" ;; --extra-rcflags=*) RCFLAGS="$RCFLAGS $optarg" ;; --disable-thread) thread="no" ;; --disable-win32thread) [ "$thread" != "no" ] && thread="posix" ;; --disable-swscale) swscale="no" ;; --enable-lto) lto="auto" ;; --enable-debug) debug="yes" ;; --enable-gprof) CFLAGS="$CFLAGS -pg" LDFLAGS="$LDFLAGS -pg" gprof="yes" ;; --enable-strip) strip="yes" ;; --enable-pic) pic="yes" ;; --host=*) host="$optarg" ;; --disable-vsx) vsx="no" ;; --disable-opencl) opencl="no" ;; --cross-prefix=*) cross_prefix="$optarg" ;; --sysroot=*) CFLAGS="$CFLAGS --sysroot=$optarg" LDFLAGS="$LDFLAGS --sysroot=$optarg" ;; --bit-depth=*) bit_depth="$optarg" if [ "$bit_depth" -lt "8" -o "$bit_depth" -gt "10" ]; then echo "Supplied bit depth must be in range [8,10]." exit 1 elif [[ "$bit_depth" = "9" || "$bit_depth" = "10" ]]; then echo "BitDepth $bit_depth not supported currently." exit 1 fi bit_depth=`expr $bit_depth + 0` ;; --chroma-format=*) chroma_format="$optarg" if [ $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then echo "Supplied chroma format must be 420, 422, 444 or all." exit 1 fi ;; *) echo "Unknown option $opt, ignored" ;; esac done [ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static." CC="${CC-${cross_prefix}gcc}" STRIP="${STRIP-${cross_prefix}strip}" INSTALL="${INSTALL-install}" PKGCONFIG="${PKGCONFIG-${cross_prefix}pkg-config}" # ar and ranlib doesn't load the LTO plugin by default, prefer the gcc-prefixed wrappers which does. 
if ${cross_prefix}gcc-ar --version >/dev/null 2>&1; then AR="${AR-${cross_prefix}gcc-ar}" else AR="${AR-${cross_prefix}ar}" fi if ${cross_prefix}gcc-ranlib --version >/dev/null 2>&1; then RANLIB="${RANLIB-${cross_prefix}gcc-ranlib}" else RANLIB="${RANLIB-${cross_prefix}ranlib}" fi if [ "x$host" = x ]; then host=`${BUILDPATH}/config.guess` fi # normalize a triplet into a quadruplet host=`${BUILDPATH}/config.sub $host` # split $host host_cpu="${host%%-*}" host="${host#*-}" host_vendor="${host%%-*}" host_os="${host#*-}" trap 'rm -f conftest*' EXIT # test for use of compilers that require specific handling cc_base=`basename "$CC"` QPRE="-" if [[ $host_os = mingw* || $host_os = cygwin* ]]; then if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths. [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS" compiler=ICL compiler_style=MS CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras" QPRE="-Q" `$CC 2>&1 | grep -q IA-32` && host_cpu=i486 `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64 cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" if cc_check '' -Qdiag-error:10006,10157 ; then CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157" fi elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then # Standard Microsoft Visual Studio compiler=CL compiler_style=MS CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras" `$CC 2>&1 | grep -q 'x86'` && host_cpu=i486 `$CC 2>&1 | grep -q 'x64'` && host_cpu=x86_64 cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer" else # MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones. 
CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L" fi else if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then AR="xiar" compiler=ICC fi fi if [[ "$cc_base" = clang* ]]; then if cc_check '' -Werror=unknown-warning-option ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" fi fi libm="" case $host_os in beos*) SYS="BEOS" define HAVE_MALLOC_H ;; darwin*) SYS="MACOSX" libm="-lm" if [ "$pic" = "no" ]; then cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic" fi # TODO: Fix compiling error under mac osx (force disabled now) asm="no" ;; freebsd*) SYS="FREEBSD" libm="-lm" ;; kfreebsd*-gnu) SYS="FREEBSD" define HAVE_MALLOC_H libm="-lm" ;; netbsd*) SYS="NETBSD" libm="-lm" ;; openbsd*) SYS="OPENBSD" libm="-lm" ;; *linux*) SYS="LINUX" define HAVE_MALLOC_H libm="-lm" ;; gnu*) SYS="HURD" define HAVE_MALLOC_H libm="-lm" ;; cygwin*|mingw*|msys*) EXE=".exe" if [[ $host_os = cygwin* ]] && cpp_check "" "" "defined(__CYGWIN__)" ; then SYS="CYGWIN" define HAVE_MALLOC_H else SYS="WINDOWS" DEVNULL="NUL" LDFLAGSCLI="$LDFLAGSCLI -lshell32" [ $compiler = GNU ] && RC="${RC-${cross_prefix}windres}" || RC="${RC-rc}" fi ;; sunos*|solaris*) SYS="SunOS" define HAVE_MALLOC_H libm="-lm" if cc_check "" /usr/lib/64/values-xpg6.o; then LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o" else LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o" fi if test -x /usr/ucb/install ; then INSTALL=/usr/ucb/install elif test -x /usr/bin/ginstall ; then # OpenSolaris INSTALL=/usr/bin/ginstall elif test -x /usr/gnu/bin/install ; then # OpenSolaris INSTALL=/usr/gnu/bin/install fi HAVE_GETOPT_LONG=0 ;; *qnx*) SYS="QNX" define HAVE_MALLOC_H libm="-lm" HAVE_GETOPT_LONG=0 CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" ;; *haiku*) SYS="HAIKU" ;; *) die "Unknown system $host, edit the configure" ;; esac LDFLAGS="$LDFLAGS $libm" stack_alignment=4 case $host_cpu in i*86) ARCH="X86" AS="${AS-yasm}" AS_EXT=".asm" CFLAGS="$CFLAGS -DARCH_X86_64=0" ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" fi if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then CFLAGS="$CFLAGS -mfpmath=sse -msse -msse2" fi CFLAGS="-m32 $CFLAGS" LDFLAGS="-m32 $LDFLAGS" fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" AS="${AS-yasm}" AS_EXT=".asm" CFLAGS="$CFLAGS -DARCH_X86_64=1" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" stack_alignment=16 [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX" if cc_check '' "-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win64" if [ $compiler = GNU ]; then # only the GNU toolchain is inconsistent in prefixing function names with _ cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" cc_check "" "-Wl,--high-entropy-va" && LDFLAGS="$LDFLAGS -Wl,--high-entropy-va" LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" LDFLAGSCLI="$LDFLAGSCLI -Wl,--image-base,0x140000000" SOFLAGS="$SOFLAGS -Wl,--image-base,0x180000000" 
RCFLAGS="--target=pe-x86-64 $RCFLAGS" fi else ASFLAGS="$ASFLAGS -f elf64" fi ;; powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC AS="${AS-${CC}}" AS_EXT=".c" if [ $SYS = MACOSX ] ; then CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4" else CFLAGS="$CFLAGS -maltivec -mabi=altivec" define HAVE_ALTIVEC_H fi if [ "$vsx" != "no" ] ; then vsx="no" if cc_check "" "-mvsx" ; then CFLAGS="$CFLAGS -mvsx" define HAVE_VSX vsx="yes" fi fi fi ;; sparc) ARCH="SPARC" ;; mips*) ARCH="MIPS" AS="${AS-${CC}}" AS_EXT=".c" ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! echo $CFLAGS | grep -Eq '\-arch' ; then CFLAGS="$CFLAGS -arch armv7" LDFLAGS="$LDFLAGS -arch armv7" fi else AS="${AS-${CC}}" fi ;; aarch64) ARCH="AARCH64" stack_alignment=16 if [ "$SYS" = MACOSX ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX" else AS="${AS-${CC}}" fi ;; s390|s390x) ARCH="S390" ;; hppa*|parisc*) ARCH="PARISC" ;; ia64) ARCH="IA64" ;; alpha*) ARCH="ALPHA" ;; *) ARCH="$(echo $host_cpu | tr a-z A-Z)" ;; esac [ "$vsx" != "yes" ] && vsx="no" if [ $SYS = WINDOWS ]; then if ! rc_check "0 RCDATA {0}" ; then RC="" fi if cpp_check "winapifamily.h" "" "!WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)" ; then [ $compiler = CL ] || die "WinRT requires MSVC" define HAVE_WINRT CFLAGS="$CFLAGS -MD" LDFLAGS="$LDFLAGS -appcontainer" if ! cpp_check "" "" "defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0603" ; then die "_WIN32_WINNT must be defined to at least 0x0603 (Windows 8.1) for WinRT" elif cpp_check "" "" "_WIN32_WINNT >= 0x0A00" ; then # Universal Windows Platform (Windows 10) LDFLAGS="$LDFLAGS -lWindowsApp" fi cli="no" opencl="no" fi fi log_msg "xavs2 configure script" if [ -n "$*" ]; then msg="Command line options:" for i in $@; do msg="$msg \"$i\"" done log_msg "$msg" fi log_msg "" # check requirements cc_check || die "No working C compiler found." if [ $compiler_style = GNU ]; then if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=gnu99 -D_GNU_SOURCE" elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE" elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then die "C99 compiler is needed for compilation." fi fi if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then pic="yes" fi if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if cc_check '' -mpreferred-stack-boundary=5 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" stack_alignment=32 elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=4" stack_alignment=16 fi elif [ $compiler = ICC -a $ARCH = X86 ]; then # icc on linux has various degrees of mod16 stack support if [ $SYS = LINUX ]; then # >= 12 defaults to a mod16 stack if cpp_check "" "" "__INTEL_COMPILER >= 1200" ; then stack_alignment=16 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so. 
elif cpp_check "" "" "__INTEL_COMPILER >= 1100" ; then CFLAGS="$CFLAGS -falign-stack=assume-16-byte" stack_alignment=16 fi # < 11 is completely incapable of keeping a mod16 stack fi fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if ! as_check "vpmovzxwd ymm0, xmm0" ; then VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1` echo "Found $VER" echo "Minimum version is yasm-1.2.0" echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM ASFLAGS="$ASFLAGS -Worphan-labels" define HAVE_MMX fi if [ $asm = auto -a $ARCH = ARM ] ; then # set flags so neon is built by default echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon" if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON ASFLAGS="$ASFLAGS -c" else echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a $ARCH = AARCH64 ] ; then if cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON ASFLAGS="$ASFLAGS -c" else echo "no NEON support, try adding -mfpu=neon to CFLAGS" echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then # check if the assembler supports '.func' (clang 3.5 does not) as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 fi if [ $asm = auto -a $ARCH = MIPS ] ; then if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS" fi if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then define HAVE_MSA else echo "You specified a pre-MSA CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" define ARCH_$ARCH define SYS_$SYS define STACK_ALIGNMENT $stack_alignment ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. 
each have flags that will cause the check to fail as well CPU_ENDIAN="little-endian" if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then define WORDS_BIGENDIAN CPU_ENDIAN="big-endian" elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then die "endian test failed" fi fi if [ "$cli_libxavs2" = "system" -a "$shared" != "yes" ] ; then [ "$static" = "yes" ] && die "Option --system-libxavs2 can not be used together with --enable-static" if $PKGCONFIG --exists xavs2 2>/dev/null; then XAVS2_LIBS="$($PKGCONFIG --libs xavs2)" XAVS2_INCLUDE_DIR="${XAVS2_INCLUDE_DIR-$($PKGCONFIG --variable=includedir xavs2)}" configure_system_override "$XAVS2_INCLUDE_DIR" || die "Detection of system libxavs2 configuration failed" else die "Can not find system libxavs2" fi fi # autodetect options that weren't forced nor disabled libpthread="" if [ "$SYS" = "WINDOWS" -a "$thread" = "posix" ] ; then if [ "$gpl" = "no" ] ; then echo "Warning: pthread-win32 is LGPL and is therefore not supported with --disable-gpl" thread="no" elif cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then libpthread="-lpthread" elif cc_check pthread.h -lpthreadGC2 "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2" elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lwsock32" define PTW32_STATIC_LIB elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lws2_32" define PTW32_STATIC_LIB else thread="no" fi elif [ "$thread" != "no" ] ; then thread="no" case $SYS in BEOS) thread="beos" define HAVE_BEOSTHREAD ;; WINDOWS) thread="win32" define HAVE_WIN32THREAD ;; QNX) cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc" ;; *) if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then thread="posix" libpthread="-lpthread" else cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread="" fi ;; esac fi if [ "$thread" = "posix" ]; then LDFLAGS="$LDFLAGS $libpthread" define HAVE_POSIXTHREAD if [ "$SYS" = "LINUX" ] && cc_check sched.h "-D_GNU_SOURCE -Werror" "cpu_set_t p_aff; return CPU_COUNT(&p_aff);" ; then define HAVE_CPU_COUNT fi fi [ "$thread" != "no" ] && define HAVE_THREAD if cc_check "math.h" "-Werror" "return log2f(2);" ; then define HAVE_LOG2F fi if [ "$SYS" != "WINDOWS" ] && cpp_check "sys/mman.h unistd.h" "" "defined(MAP_PRIVATE)"; then define HAVE_MMAP fi if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then define HAVE_THP fi if [ "$cli" = "no" ] ; then avs="no" lavf="no" ffms="no" gpac="no" lsmash="no" mp4="no" swscale="no" fi if [ "$swscale" = "auto" ] ; then swscale="no" if $PKGCONFIG --exists libswscale 2>/dev/null; then SWSCALE_LIBS="$SWSCALE_LIBS $($PKGCONFIG --libs libswscale libavutil)" SWSCALE_CFLAGS="$SWSCALE_CFLAGS $($PKGCONFIG --cflags libswscale libavutil)" fi [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil" if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then if cpp_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" 
"defined(AV_PIX_FMT_FLAG_RGB)" ; then swscale="yes" else echo "Warning: AV_PIX_FMT_FLAG_RGB is missing from libavutil, update for swscale support" fi fi fi if [ "$lavf" = "auto" ] ; then lavf="no" if $PKGCONFIG --exists libavformat libavcodec libswscale 2>/dev/null; then LAVF_LIBS="$LAVF_LIBS $($PKGCONFIG --libs libavformat libavcodec libavutil libswscale)" LAVF_CFLAGS="$LAVF_CFLAGS $($PKGCONFIG --cflags libavformat libavcodec libavutil libswscale)" fi if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then LAVF_LIBS="-lavformat" for lib in -lpostproc -lavcodec -lswscale -lavutil -lm -lz -lbz2 $libpthread -lavifil32 -lws2_32; do cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib" done fi LAVF_LIBS="-L. $LAVF_LIBS" if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "av_frame_free(0);" ; then if [ "$swscale" = "yes" ]; then lavf="yes" else echo "Warning: libavformat is not supported without swscale support" fi fi fi if [ "$ffms" = "auto" ] ; then ffms_major="2"; ffms_minor="21"; ffms_micro="0"; ffms_bump="0" ffms="no" if $PKGCONFIG --exists ffms2 2>/dev/null; then FFMS2_LIBS="$FFMS2_LIBS $($PKGCONFIG --libs ffms2)" FFMS2_CFLAGS="$FFMS2_CFLAGS $($PKGCONFIG --cflags ffms2)" fi [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2" if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" "FFMS_DestroyVideoSource(0);" ; then ffms="yes" elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then ffms="yes" FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS" fi error="ffms must be at least version $ffms_major.$ffms_minor.$ffms_micro.$ffms_bump" if [ $ffms = "yes" ] && ! cpp_check "ffms.h" "$FFMS2_CFLAGS" "FFMS_VERSION >= (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)" "$error"; then ffms="no" echo "Warning: $error" fi if [ "$ffms" = "yes" -a "$swscale" = "no" ]; then echo "Warning: ffms is not supported without swscale support" ffms="no" fi fi if [ "$swscale" = "yes" ]; then LDFLAGSCLI="$SWSCALE_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $SWSCALE_CFLAGS" define HAVE_SWSCALE if [ "$lavf" = "yes" ]; then LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $LAVF_CFLAGS" define HAVE_LAVF fi if [ "$ffms" = "yes" ]; then LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $FFMS2_CFLAGS" define HAVE_FFMS fi fi if [ "$lsmash" = "auto" ] ; then lsmash="no" if $PKGCONFIG --exists liblsmash 2>/dev/null; then LSMASH_LIBS="$LSMASH_LIBS $($PKGCONFIG --libs liblsmash)" LSMASH_CFLAGS="$LSMASH_CFLAGS $($PKGCONFIG --cflags liblsmash)" fi [ -z "$LSMASH_LIBS" ] && LSMASH_LIBS="-llsmash" if cc_check lsmash.h "$LSMASH_CFLAGS $LSMASH_LIBS" ; then if cpp_check lsmash.h "$LSMASH_CFLAGS" "LSMASH_VERSION_MAJOR > 1 || (LSMASH_VERSION_MAJOR == 1 && LSMASH_VERSION_MINOR >= 5)" ; then lsmash="yes" else echo "Warning: lsmash is too old, update to rev.895 or later" fi fi fi if [ "$gpac" = "auto" -a "$lsmash" != "yes" ] ; then gpac="no" GPAC_LIBS="-lgpac_static" cc_check "" -lz && GPAC_LIBS="$GPAC_LIBS -lz" if [ "$SYS" = "WINDOWS" ] ; then cc_check "" -lws2_32 && GPAC_LIBS="$GPAC_LIBS -lws2_32" cc_check "" -lwinmm && GPAC_LIBS="$GPAC_LIBS -lwinmm" fi if cc_check gpac/isomedia.h "$GPAC_LIBS" ; then if cc_check gpac/isomedia.h "$GPAC_LIBS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0);" ; then gpac="yes" else echo "Warning: gpac is too old, update to 2007-06-21 UTC or later" fi fi fi if [ "$lsmash" = "yes" ] ; then mp4="lsmash" LDFLAGSCLI="$LSMASH_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $LSMASH_CFLAGS" define HAVE_LSMASH elif [ "$gpac" = "yes" ] ; then mp4="gpac" define HAVE_GPAC 
LDFLAGSCLI="$GPAC_LIBS $LDFLAGSCLI" fi if [ "$avs" = "auto" ] ; then avs="no" # cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then avs="avisynth" define HAVE_AVS define USE_AVXSYNTH 0 elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then # AvxSynth currently only supports Linux and OSX avs="avxsynth" define HAVE_AVS define USE_AVXSYNTH 1 AVS_LIBS="-ldl" LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" fi fi cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT if [ "$pic" = "yes" ] ; then [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC" ASFLAGS="$ASFLAGS -DPIC" # resolve textrels in the x86 asm cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic" [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text" fi if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then CFLAGS="$CFLAGS -fomit-frame-pointer" fi if [ "$strip" = "yes" ]; then LDFLAGS="$LDFLAGS -s" fi if [ "$debug" = "yes" ]; then CFLAGS="-O1 -g $CFLAGS" RCFLAGS="$RCFLAGS -DDEBUG" else CFLAGS="-O3 -ffast-math $CFLAGS" if [ "$lto" = "auto" ] && [ $compiler = GNU ] && cc_check "" "-flto" ; then lto="yes" CFLAGS="$CFLAGS -flto" LDFLAGS="$LDFLAGS -O3 -flto" fi fi [ "$lto" = "auto" ] && lto="no" if cc_check '' -fno-tree-vectorize ; then CFLAGS="$CFLAGS -fno-tree-vectorize" fi if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then # workaround gcc/ld bug with alignment of static variables/arrays that are initialized to zero cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss" fi if cc_check '' -Wshadow ; then CFLAGS="-Wshadow $CFLAGS" fi if cc_check '' -Wmaybe-uninitialized ; then if [ $SYS = MACOSX ] ; then CFLAGS="-Wno-uninitialized $CFLAGS" else CFLAGS="-Wno-maybe-uninitialized $CFLAGS" fi fi if [ $compiler = ICC -o $compiler = ICL ] ; then if cc_check 'extras/intel_dispatcher.h' '' 'xavs2_intel_dispatcher_override();' ; then define HAVE_INTEL_DISPATCHER fi fi if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" CFLAGS+=" -DHIGH_BIT_DEPTH=1" opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" CFLAGS+=" -DHIGH_BIT_DEPTH=0" fi if [ "$chroma_format" != "all" ]; then define CHROMA_FORMAT CHROMA_$chroma_format fi ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth" CFLAGS+=" -DBIT_DEPTH=$bit_depth" [ $gpl = yes ] && define HAVE_GPL && xavs2_gpl=1 || xavs2_gpl=0 [ $interlaced = yes ] && define HAVE_INTERLACED && xavs2_interlaced=1 || xavs2_interlaced=0 libdl="" if [ "$opencl" = "yes" ]; then opencl="no" # cygwin can use opencl if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then opencl="yes" define HAVE_OPENCL elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then opencl="yes" define HAVE_OPENCL libdl="-ldl" fi LDFLAGS="$LDFLAGS $libdl" elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then libdl="-ldl" LDFLAGS="$LDFLAGS $libdl" fi #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 done # generate exported config file config_chroma_format="XAVS2_CSP_I$chroma_format" [ "$config_chroma_format" == "XAVS2_CSP_Iall" ] && config_chroma_format="0" cat > $BUILDPATH/xavs2_config.h << EOF #define XAVS2_BIT_DEPTH $bit_depth #define XAVS2_GPL $xavs2_gpl #define XAVS2_INTERLACED $xavs2_interlaced #define XAVS2_CHROMA_FORMAT $config_chroma_format EOF # generate 
version.h cd ${SRCPATH}/.. ./version.sh >> $BUILDPATH/xavs2_config.h cd ${BUILDPATH} if [ "$cli_libxavs2" = "system" ] ; then if [ "$shared" = "yes" ]; then CLI_LIBXAVS2='$(SONAME)' else CLI_LIBXAVS2= LDFLAGSCLI="$XAVS2_LIBS $LDFLAGSCLI" cc_check 'stdint.h xavs2.h' '' 'xavs2_encoder_open(0);' || die "System libxavs2 can't be used for compilation of this version" fi else CLI_LIBXAVS2='$(LIBXAVS2)' fi DEPMM="${QPRE}MM" DEPMT="${QPRE}MT" if [ $compiler_style = MS ]; then AR="lib -nologo -out:" LD="link -out:" if [ $compiler = ICL ]; then AR="xi$AR" LD="xi$LD" else mslink="$(dirname "$(command -v cl 2>/dev/null)")/link" [ -x "$mslink" ] && LD="\"$mslink\" -out:" fi HAVE_GETOPT_LONG=0 LDFLAGS="-nologo -incremental:no $(cl_ldflags $LDFLAGS)" LDFLAGSCLI="$(cl_ldflags $LDFLAGSCLI)" LIBXAVS2=libxavs2.lib RANLIB= [ -n "$RC" ] && RCFLAGS="$RCFLAGS -nologo -I. -I\$(SRCPATH)/extras -fo" STRIP= if [ $debug = yes ]; then LDFLAGS="-debug $LDFLAGS" CFLAGS="-D_DEBUG $CFLAGS" else CFLAGS="-DNDEBUG $CFLAGS" fi else # gcc/icc DEPMM="$DEPMM -g0" AR="$AR rc " LD="$CC -o " LIBXAVS2=libxavs2.a [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. -o " fi [ $compiler != GNU ] && CFLAGS="$(cc_cflags $CFLAGS)" if [ $compiler = ICC -o $compiler = ICL ]; then # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__ PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir." PROF_GEN_LD= PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir." PROF_USE_LD= elif [ $compiler = CL ]; then # Visual Studio # _M_IX86_FP is only defined on x86 [ $ARCH = X86 ] && cpp_check '' '' '_M_IX86_FP >= 1' && define __SSE__ [ $ARCH = X86_64 ] && define __SSE__ # As long as the cli application can't link against the dll, the dll can not be pgo'd. 
# pgds are link flag specific and the -dll flag for creating the dll makes it unshareable with the cli PROF_GEN_CC="-GL" PROF_GEN_LD="-LTCG:PGINSTRUMENT" PROF_USE_CC="-GL" PROF_USE_LD="-LTCG:PGOPTIMIZE" else PROF_GEN_CC="-fprofile-generate" PROF_GEN_LD="-fprofile-generate" PROF_USE_CC="-fprofile-use" PROF_USE_LD="-fprofile-use" fi # generate config files cat > config.mak << EOF SRCPATH=$SRCPATH prefix=$prefix exec_prefix=$exec_prefix bindir=$bindir libdir=$libdir includedir=$includedir SYS_ARCH=$ARCH SYS=$SYS CC=$CC CFLAGS=$CFLAGS COMPILER=$compiler COMPILER_STYLE=$compiler_style DEPMM=$DEPMM DEPMT=$DEPMT LD=$LD LDFLAGS=$LDFLAGS LIBXAVS2=$LIBXAVS2 AR=$AR RANLIB=$RANLIB STRIP=$STRIP INSTALL=$INSTALL AS=$AS ASFLAGS=$ASFLAGS RC=$RC RCFLAGS=$RCFLAGS EXE=$EXE HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG DEVNULL=$DEVNULL PROF_GEN_CC=$PROF_GEN_CC PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD HAVE_OPENCL=$opencl EOF if [ $compiler_style = MS ]; then echo '%.o: %.c' >> config.mak echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak fi if [ "$cli" = "yes" ]; then echo 'default: cli' >> config.mak echo 'install: install-cli' >> config.mak fi if [ "$shared" = "yes" ]; then API=$(grep '#define XAVS2_VERSION' < $BUILDPATH/xavs2_config.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/') if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then echo "SONAME=libxavs2-$API.dll" >> config.mak if [ $compiler_style = MS ]; then echo 'IMPLIBNAME=libxavs2.dll.lib' >> config.mak # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations # MSVC link does not act similarly, so it is required to make an export definition out of xavs2.h and use it at link time echo "SOFLAGS=-dll -def:xavs2.def -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak echo "EXPORTS" > xavs2.def # export API functions grep "^\(int\|void\|xavs2_t\).*xavs2" ${SRCPATH}/xavs2.h | sed -e "s/.*\(xavs2.*\)(.*/\1/;s/open/open_$API/g" >> xavs2.def # export API variables/data. 
must be flagged with the DATA keyword grep "extern.*xavs2" ${SRCPATH}/xavs2.h | sed -e "s/.*\(xavs2\w*\)\W.*/\1 DATA/;" >> xavs2.def else echo 'IMPLIBNAME=libxavs2.dll.a' >> config.mak echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) $SOFLAGS" >> config.mak fi elif [ "$SYS" = "MACOSX" ]; then echo "SOSUFFIX=dylib" >> config.mak echo "SONAME=libxavs2.$API.dylib" >> config.mak echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak elif [ "$SYS" = "SunOS" ]; then echo "SOSUFFIX=so" >> config.mak echo "SONAME=libxavs2.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak else echo "SOSUFFIX=so" >> config.mak echo "SONAME=libxavs2.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak fi echo 'default: lib-shared' >> config.mak echo 'install: install-lib-shared' >> config.mak fi if [ "$static" = "yes" ]; then echo 'default: lib-static' >> config.mak echo 'install: install-lib-static' >> config.mak fi echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak echo "CLI_LIBXAVS2 = $CLI_LIBXAVS2" >> config.mak cat > xavs2.pc << EOF prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir includedir=$includedir Name: xavs2 Description: AVS2 (IEEE 1857.4) encoder library Version: $(grep POINTVER < $BUILDPATH/xavs2_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -lxavs2 $([ "$shared" = "yes" ] || echo $libpthread $libm $libdl) Libs.private: $([ "$shared" = "yes" ] && echo $libpthread $libm $libdl) Cflags: -I$includedir EOF filters="crop select_every" gpl_filters="" [ $swscale = yes ] && filters="resize $filters" [ $gpl = yes ] && filters="$filters $gpl_filters" cat > conftest.log <> config.log cat conftest.log >> config.log cat conftest.log # [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile mkdir -p common/{aarch64,arm,ppc,x86,vec} encoder test echo echo "You can run 'make' or 'make fprofiled' now." xavs2-1.3/build/vs2013/000077500000000000000000000000001340660520300144545ustar00rootroot00000000000000xavs2-1.3/build/vs2013/libxavs2.vcxproj000066400000000000000000000422541340660520300176320ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4} Win32Proj DynamicLibrary true v120 MultiByte DynamicLibrary true v120 MultiByte DynamicLibrary false v120 true MultiByte DynamicLibrary false v120 true MultiByte true $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ true true $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ true false $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ true false $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ true Level4 Disabled XAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ProgramDatabase MultiThreadedDebug ..\..\source;..\..\source\common;..\..\source\encoder;..\..\source\pthread Windows true $(IntDir)$(TargetName).pdb $(OutDir) libxavs2asm.lib;libxavs2intrin_avx.lib;libxavs2intrin_sse.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ARCH_X86_64=0;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;PREFIX cd /d "$(SolutionDir)..\.." 
&& sh version.sh UpdateSourceVersionInfo Level4 Disabled XAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ProgramDatabase MultiThreadedDebug ..\..\source;..\..\source\common;..\..\source\encoder;..\..\source\pthread Windows true $(IntDir)$(TargetName).pdb $(OutDir) libxavs2asm.lib;libxavs2intrin_avx.lib;libxavs2intrin_sse.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ARCH_X86_64=1;HIGH_BIT_DEPTH=0;BIT_DEPTH=8 cd /d "$(SolutionDir)..\.." && sh version.sh UpdateSourceVersionInfo Level4 NotUsing MaxSpeed true true XAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) Speed MultiThreaded ..\..\source;..\..\source\common;..\..\source\encoder;..\..\source\pthread Windows true true true $(IntDir)$(TargetName).pdb $(OutDir) libxavs2asm.lib;libxavs2intrin_avx.lib;libxavs2intrin_sse.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ARCH_X86_64=0;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;PREFIX cd /d "$(SolutionDir)..\.." && sh version.sh UpdateSourceVersionInfo Level4 NotUsing MaxSpeed true true XAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) Speed MultiThreaded ..\..\source;..\..\source\common;..\..\source\encoder;..\..\source\pthread Windows true true true $(IntDir)$(TargetName).pdb $(OutDir) libxavs2asm.lib;libxavs2intrin_avx.lib;libxavs2intrin_sse.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ARCH_X86_64=1;HIGH_BIT_DEPTH=0;BIT_DEPTH=8 cd /d "$(SolutionDir)..\.." 
&& sh version.sh UpdateSourceVersionInfo xavs2-1.3/build/vs2013/libxavs2.vcxproj.filters000066400000000000000000000257041340660520300213020ustar00rootroot00000000000000 {4ff04604-8f8d-4071-b0b8-49ac8bae7b6b} asm {507bf760-8348-4146-b19d-9ac9a3fe30d3} h;hpp {af245058-2850-4e6f-9bf2-7db09c8a284f} cpp;c;h;hpp;inl;def;asm {6aaaedbd-451b-4e6c-8af2-4d05203608a8} h;hpp {0276c2da-15f9-498b-a367-17310abbbf04} cpp;c;h;hpp;inl;def;asm common-src common-src common-src common-src common-src common-src common-src common-src common-src encoder-src encoder-src encoder-src encoder-src common-src common-src common-src encoder-src encoder-src encoder-src encoder-src encoder-src encoder-src encoder-src encoder-src common-src encoder-src encoder-src encoder-src encoder-src encoder-src common-src common-src encoder-src encoder-src encoder-src encoder-src common-src encoder-src common-src encoder-src encoder-src encoder-src encoder-src encoder-src encoder-src common-inc common-inc common-inc common-inc common-inc common-inc common-inc common-inc common-inc common-inc common-inc encoder-inc encoder-inc encoder-inc encoder-inc common-inc common-inc common-inc encoder-inc encoder-inc encoder-inc encoder-inc encoder-inc encoder-inc common-asm common-asm common-inc common-asm common-asm common-asm common-asm common-asm encoder-inc encoder-inc common-inc common-inc encoder-inc common-inc encoder-inc common-inc common-inc encoder-inc encoder-inc common-inc encoder-inc encoder-inc xavs2-1.3/build/vs2013/libxavs2asm.vcxproj000066400000000000000000000241471340660520300203340ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread ProgramDatabase MultiThreadedDebug 4752; Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread MultiThreadedDebug 4752; NotSet Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded 4752; Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded 4752; NotSet Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; xavs2-1.3/build/vs2013/libxavs2asm.vcxproj.filters000066400000000000000000000041241340660520300217740ustar00rootroot00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx vec asm asm asm asm asm asm asm asm asm asm asm asm asm asm asm asm 
xavs2-1.3/build/vs2013/libxavs2intrin_avx.vcxproj000066400000000000000000000242361340660520300217340ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {28F41748-29FF-4423-8B3A-320455EC0B1D} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread ProgramDatabase MultiThreadedDebug 4752; AdvancedVectorExtensions Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread MultiThreadedDebug 4752; AdvancedVectorExtensions Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded 4752; AdvancedVectorExtensions Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded 4752; AdvancedVectorExtensions Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; xavs2-1.3/build/vs2013/libxavs2intrin_avx.vcxproj.filters000066400000000000000000000033261340660520300234000ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx vec vec vec vec vec vec vec vec vec vec vec xavs2-1.3/build/vs2013/libxavs2intrin_sse.vcxproj000066400000000000000000000236611340660520300217310ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {90F0D7B6-BBB4-4076-8543-A2C1D7176946} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread ProgramDatabase MultiThreadedDebug Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread MultiThreadedDebug NotSet Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true 
HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\common;..\..\source\pthread Speed MultiThreaded NotSet Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; xavs2-1.3/build/vs2013/libxavs2intrin_sse.vcxproj.filters000066400000000000000000000041671340660520300234000ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx vec vec vec vec vec vec vec vec vec vec vec vec vec vec vec xavs2-1.3/build/vs2013/xavs2.sln000066400000000000000000000113571340660520300162440ustar00rootroot00000000000000 Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 VisualStudioVersion = 12.0.40629.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xavs2", "xavs2.vcxproj", "{01B07A15-D428-468F-ADE3-35982416A66A}" ProjectSection(ProjectDependencies) = postProject {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4} = {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libxavs2asm", "libxavs2asm.vcxproj", "{A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libxavs2", "libxavs2.vcxproj", "{F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}" ProjectSection(ProjectDependencies) = postProject {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} = {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} {28F41748-29FF-4423-8B3A-320455EC0B1D} = {28F41748-29FF-4423-8B3A-320455EC0B1D} {90F0D7B6-BBB4-4076-8543-A2C1D7176946} = {90F0D7B6-BBB4-4076-8543-A2C1D7176946} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libxavs2intrin_avx", "libxavs2intrin_avx.vcxproj", "{28F41748-29FF-4423-8B3A-320455EC0B1D}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libxavs2intrin_sse", "libxavs2intrin_sse.vcxproj", "{90F0D7B6-BBB4-4076-8543-A2C1D7176946}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 Release|Win32 = Release|Win32 Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {01B07A15-D428-468F-ADE3-35982416A66A}.Debug|Win32.ActiveCfg = Debug|Win32 {01B07A15-D428-468F-ADE3-35982416A66A}.Debug|Win32.Build.0 = Debug|Win32 {01B07A15-D428-468F-ADE3-35982416A66A}.Debug|x64.ActiveCfg = Debug|x64 {01B07A15-D428-468F-ADE3-35982416A66A}.Debug|x64.Build.0 = Debug|x64 {01B07A15-D428-468F-ADE3-35982416A66A}.Release|Win32.ActiveCfg = Release|Win32 {01B07A15-D428-468F-ADE3-35982416A66A}.Release|Win32.Build.0 = Release|Win32 {01B07A15-D428-468F-ADE3-35982416A66A}.Release|x64.ActiveCfg = Release|x64 {01B07A15-D428-468F-ADE3-35982416A66A}.Release|x64.Build.0 = Release|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|Win32.ActiveCfg = Debug|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|Win32.Build.0 = Debug|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|x64.ActiveCfg = Debug|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|x64.Build.0 = Debug|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|Win32.ActiveCfg = Release|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|Win32.Build.0 = Release|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|x64.ActiveCfg = Release|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|x64.Build.0 = Release|x64 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Debug|Win32.ActiveCfg = Debug|Win32 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Debug|Win32.Build.0 = Debug|Win32 
{F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Debug|x64.ActiveCfg = Debug|x64 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Debug|x64.Build.0 = Debug|x64 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Release|Win32.ActiveCfg = Release|Win32 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Release|Win32.Build.0 = Release|Win32 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Release|x64.ActiveCfg = Release|x64 {F202E04C-47EC-42CD-8AC0-73B0D77E4DD4}.Release|x64.Build.0 = Release|x64 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Debug|Win32.ActiveCfg = Debug|Win32 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Debug|Win32.Build.0 = Debug|Win32 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Debug|x64.ActiveCfg = Debug|x64 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Debug|x64.Build.0 = Debug|x64 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Release|Win32.ActiveCfg = Release|Win32 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Release|Win32.Build.0 = Release|Win32 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Release|x64.ActiveCfg = Release|x64 {28F41748-29FF-4423-8B3A-320455EC0B1D}.Release|x64.Build.0 = Release|x64 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Debug|Win32.ActiveCfg = Debug|Win32 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Debug|Win32.Build.0 = Debug|Win32 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Debug|x64.ActiveCfg = Debug|x64 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Debug|x64.Build.0 = Debug|x64 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Release|Win32.ActiveCfg = Release|Win32 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Release|Win32.Build.0 = Release|Win32 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Release|x64.ActiveCfg = Release|x64 {90F0D7B6-BBB4-4076-8543-A2C1D7176946}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection EndGlobal xavs2-1.3/build/vs2013/xavs2.vcxproj000066400000000000000000000243431340660520300171420ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {01B07A15-D428-468F-ADE3-35982416A66A} Win32Proj xavs2 Application true v120 MultiByte Application true v120 MultiByte Application false v120 true MultiByte Application false v120 true MultiByte true $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ true $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ false $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ false $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ Level4 Disabled WIN32;HIGH_BIT_DEPTH=0;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\pthread ProgramDatabase MultiThreadedDebug Console true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(OutDir) $(IntDir)$(TargetName).pdb Level4 Disabled WIN32;HIGH_BIT_DEPTH=0;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\pthread ProgramDatabase MultiThreadedDebug Console true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(OutDir) $(IntDir)$(TargetName).pdb Level4 Full true true WIN32;HIGH_BIT_DEPTH=0;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\pthread MultiThreaded Console true true true $(OutDir) kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(IntDir)$(TargetName).pdb Level4 Full true true 
WIN32;HIGH_BIT_DEPTH=0;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\source;..\..\source\pthread MultiThreaded Console true true true $(OutDir) kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(IntDir)$(TargetName).pdb {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942} false xavs2-1.3/build/vs2013/xavs2.vcxproj.filters000066400000000000000000000020221340660520300205770ustar00rootroot00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx {eaa2aa69-cd94-4ae6-8a75-dead54e6f39d} src config config config config xavs2-1.3/config/000077500000000000000000000000001340660520300136645ustar00rootroot00000000000000xavs2-1.3/config/encoder_ai.cfg000066400000000000000000000210771340660520300164440ustar00rootroot00000000000000# New Input File Format is as follows # = # Comment # See configfile.h for a list of supported ParameterNames ########################################################################################## # Files ########################################################################################## InputFile = "E:\seqs\AVS2_Test_Seq\BasketballPass_416x240_50.yuv" # Input sequence, YUV 4:2:0 InputHeaderLength = 0 # If the inputfile has a header, state it's length in byte here FramesToBeEncoded = 30 # Number of frames to be coded SourceWidth = 416 # Image width in Pels, must be multiple of 16 SourceHeight = 240 # Image height in Pels, must be multiple of 16 InputSampleBitDepth = 8 # Source bit depth (8 bit or 10 bit for all components) SampleBitDepth = 8 # Internal coding bit depth (8 bit or 10 bit for all components) ReconFile = "test_rec.yuv" OutputFile = "test.avs" ########################################################################################## # Maximum Size ########################################################################################## MaxSizeInBit = 6 # Maximum CU size ########################################################################################## # Encoder Control ########################################################################################## ProfileID = 32 # Profile ID (18: MAIN PICTURE profile, 32: MAIN profile, 34: MAIN10 profile) LevelID = 66 # Level ID (16: 2.0; 32: 4.0; 34: 4.2; 64: 6.0; 66: 6.2) IntraPeriodMin = 1 # min Period of I-Frames (0=only first) IntraPeriodMax = 1 # max Period of I-Frames (0=only first) UseHadamard = 1 # Hadamard transform (0=not used, 1=used) ########################################################################################## # RD Optimization ########################################################################################## RdoLevel = 3 # RD-optimized mode decision (0:off, 1: only for best partition mode of one CU, 2: only for best 2 partition modes; 3: All partition modes) ########################################################################################## # Coding Tools ########################################################################################## NSQT = 0 # SDIP = 1 # SECTEnable = 1 # (0: Not use Secondary Transform, 1:Use Secondary Transform) ########################################################################################## # Loop filter parameters ########################################################################################## CrossSliceLoopFilter = 1 # Enable Cross Slice Boundary Filter (0=Disable, 1=Enable) LoopFilterDisable = 0 # Disable loop filter in picture header (0=Filter, 1=No Filter) 
LoopFilterParameter = 0 # Send loop filter parameter (0= No parameter, 1= Send Parameter) LoopFilterAlphaOffset = 0 # Aplha offset in loop filter LoopFilterBetaOffset = 0 # Beta offset in loop filter SAOEnable = 1 # Enable SAO (1=SAO on, 0=SAO OFF) ALFEnable = 1 # Enable ALF (1=ALF on, 0=ALF OFF) ALF_LowLatencyEncodingEnable = 0 # Enable Low Latency ALF (1=Low Latency ALF, 0=High Efficiency ALF) ########################################################################################## # Reference Picture Manage ########################################################################################## NumberBFrames = 0 # Number of B frames inserted (0=not used) gop_size = 1 # sub GOP size (negative numbers indicating an employ of default settings, which will invliadate the following settings.) # POC QPoffset #ref_pics_active ref_pic reference pictures predict deltaRPS num_ref_be_removed ref_be_removed Frame1: 1 0 0 0 0 1 1 ########################################################################################## #frame rate ########################################################################################### FrameRate = 8 # (1: 24000/1001,2: 24,3: 25,4: 30000/1001,5: 30,6: 50,7: 60000/1001,8: 60) ########################################################################################### #chroma format parameters ########################################################################################### ChromaFormat = 1 # (0=4:0:0,1=4:2:0,2=4:2:2) ######################################################################################## # Frequency Weighting Quantization ######################################################################################## WQEnable = 0 # Frequency Weighting Quantization (0=disable,1=enable) # SeqWQM = 0 # (0:default, 1:user define) # SeqWQFile = "seq_wq_matrix.txt" # # PicWQEnable = 0 # Frame level Frequency Weighting Quantization (0=disable,1=enable) # WQParam = 0 # Weighting Quantization Parameter(0=Default, 1=UnDetailed, 2=Detailed) # PicWQDataIndex = 1 # Picture level WQ data index (0:refer to seq_header, # # 1:derived by WQ parameter, # # 2:load from pic_header) # PicWQFile = "pic_wq_matrix.txt" # WQModel = 1 # Frequency Weighting Quantization Model (0-2) # WeightParamDetailed = "[128,98,106,116,116,128]" # User defined Parameters-Detailed # WeightParamUnDetailed = "[135,143,143,160,160,213]" # User defined Parameters-UnDetailed # # ChromaDeltaQPDisable = 1 # 1: Chroma Delta QP disable; 0: Chroma Delta QP enable # ChromaDeltaU = 0 # Chroma component U delta QP # ChromaDeltaV = 0 # Chroma component V delta QP ###################Encoder Optimization Tools############################################# ######################################################################################## #RDOQ ######################################################################################## RdoqLevel = 2 # (0: off, 1: cu level, only for best partition mode, 2: all mode) LambdaFactor = 75 # LambdaFactorP = 120 # LambdaFactorB = 100 # ########################################################################################## #RATECONTROL ########################################################################################## RateControl = 0 # 0: CQP, 1: CBR (frame level), 2: CBR (SCU level), 3: VBR TargetBitRate = 1000000 # (target bit-rate, in bps) initial_qp = 34 # initial qp for first frame (0-63) # ---------------------------------------------------------------------------- # preset level # 
---------------------------------------------------------------------------- preset_level = 8 # preset level for the tradeoff between speed and performance, ordered from fastest to slowest (default 5) # ---------------------------------------------------------------------------- # slices # ---------------------------------------------------------------------------- slice_num = 1 # number of slices per frame # ---------------------------------------------------------------------------- # parallel # ---------------------------------------------------------------------------- thread_frames = 1 # number of parallel threads for frames ( 0: auto ) thread_rows = 1 # number of parallel threads for rows ( 0: auto ) # ---------------------------------------------------------------------------- # log level # ---------------------------------------------------------------------------- log_level = 3 # log level: -1: none, 0: error, 1: warning, 2: info, 3: debug xavs2-1.3/config/encoder_ldp.cfg000066400000000000000000000253521340660520300166320ustar00rootroot00000000000000# New Input File Format is as follows # = # Comment # See configfile.h for a list of supported ParameterNames ########################################################################################## # Files ########################################################################################## InputFile = "d:\seqs\AVS2_Test_Seq\BasketballPass_416x240_50.yuv" # Input sequence, YUV 4:2:0 InputHeaderLength = 0 # If the inputfile has a header, state it's length in byte here FramesToBeEncoded = 97 # Number of frames to be coded SourceWidth = 416 # Image width in Pels, must be multiple of 16 SourceHeight = 240 # Image height in Pels, must be multiple of 16 InputSampleBitDepth = 8 # Source bit depth (8 bit or 10 bit for all components) SampleBitDepth = 8 # Internal coding bit depth (8 bit or 10 bit for all components) ReconFile = "test_rec.yuv" OutputFile = "test.avs" ########################################################################################## # Maximum Size ########################################################################################## MaxSizeInBit = 6 # Maximum CU size ########################################################################################## # Encoder Control ########################################################################################## ProfileID = 32 # Profile ID (18: MAIN PICTURE profile, 32: MAIN profile, 34: MAIN10 profile) LevelID = 66 # Level ID (16: 2.0; 32: 4.0; 34: 4.2; 64: 6.0; 66: 6.2) IntraPeriodMin = 0 # min Period of I-Frames (0=only first) IntraPeriodMax = 0 # max Period of I-Frames (0=only first) OpenGOP = 0 # Open GOP UseHadamard = 1 # Hadamard transform (0=not used, 1=used) FME = 3 # Fast Motion Estimation method (1: DIA, 2: HEX 3: UMH, 4: TZ) SearchRange = 64 # Max search range NumberReferenceFrames = 4 # Number of previous frames used for inter motion search (1-5) inter_2PU = 1 # Inter block search 2NxN or Nx2N or AMP (0=disable, 1=enable) inter_AMP = 1 # Inter block search AMP (0=disable, 1=enable) ########################################################################################## # F Frames ########################################################################################## FFRAMEEnable = 1 # (0: Don't use F frames 1:Use F frames instead of P frames) DHPEnable = 1 # (0: Don't use DHP, 1:Use DHP) MHPSKIPEnable = 1 # (0: Don't use MH_PSKIP, 1:Use MH_PSKIP) WSMEnable = 1 # (0: Don't use WSM, 1:Use WSM) 
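# Illustrative variant (not part of the original file): the switches in the
# F Frames section above enable F-frame coding and its associated tools; a
# conventional low-delay P-frame encode would turn them off together, e.g.
#   FFRAMEEnable  = 0
#   DHPEnable     = 0
#   MHPSKIPEnable = 0
#   WSMEnable     = 0
# (DHP, MH_PSKIP and WSM are listed above as F-frame tools, so they are only
# meaningful when FFRAMEEnable = 1.)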
########################################################################################## # RD Optimization ########################################################################################## RdoLevel = 3 # RD-optimized mode decision (0:off, 1: only for best partition mode of one CU, 2: only for best 2 partition modes; 3: All partition modes) ########################################################################################## # Coding Tools ########################################################################################## PMVREnable = 1 # NSQT = 1 # SDIP = 1 # SECTEnable = 1 # (0: Not use Secondary Transform, 1:Use Secondary Transform) ########################################################################################## # Loop filter parameters ########################################################################################## CrossSliceLoopFilter = 1 # Enable Cross Slice Boundary Filter (0=Disable, 1=Enable) LoopFilterDisable = 0 # Disable loop filter in picture header (0=Filter, 1=No Filter) LoopFilterParameter = 0 # Send loop filter parameter (0= No parameter, 1= Send Parameter) LoopFilterAlphaOffset = 0 # Aplha offset in loop filter LoopFilterBetaOffset = 0 # Beta offset in loop filter SAOEnable = 1 # Enable SAO (1=SAO on, 0=SAO OFF) ALFEnable = 1 # Enable ALF (1=ALF on, 0=ALF OFF) ALF_LowLatencyEncodingEnable = 0 # Enable Low Latency ALF (1=Low Latency ALF, 0=High Efficiency ALF) ########################################################################################## # Reference Picture Manage ########################################################################################## NumberBFrames = 0 # Number of B frames inserted (0=not used) gop_size = 4 # sub GOP size (negative numbers indicating an employ of default settings, which will invliadate the following settings.) 
# POC QPoffset #ref_pics_active ref_pic reference pictures predict deltaRPS num_ref_be_removed ref_be_removed Frame1: 1 5 4 1 1 5 9 13 0 1 2 Frame2: 2 4 4 1 1 2 6 10 0 1 4 Frame3: 3 5 4 1 1 3 7 11 0 1 2 Frame4: 4 2 4 1 1 4 8 12 0 1 12 ########################################################################################## #frame rate ########################################################################################### FrameRate = 8 # (1: 24000/1001,2: 24,3: 25,4: 30000/1001,5: 30,6: 50,7: 60000/1001,8: 60) ########################################################################################### #chroma format parameters ########################################################################################### ChromaFormat = 1 # (0=4:0:0,1=4:2:0,2=4:2:2) ######################################################################################## # Frequency Weighting Quantization ######################################################################################## WQEnable = 0 # Frequency Weighting Quantization (0=disable,1=enable) # SeqWQM = 0 # (0:default, 1:user define) # SeqWQFile = "seq_wq_matrix.txt" # # PicWQEnable = 0 # Frame level Frequency Weighting Quantization (0=disable,1=enable) # WQParam = 0 # Weighting Quantization Parameter(0=Default, 1=UnDetailed, 2=Detailed) # PicWQDataIndex = 1 # Picture level WQ data index (0:refer to seq_header, # # 1:derived by WQ parameter, # # 2:load from pic_header) # PicWQFile = "pic_wq_matrix.txt" # WQModel = 1 # Frequency Weighting Quantization Model (0-2) # WeightParamDetailed = "[128,98,106,116,116,128]" # User defined Parameters-Detailed # WeightParamUnDetailed = "[135,143,143,160,160,213]" # User defined Parameters-UnDetailed # # ChromaDeltaQPDisable = 1 # 1: Chroma Delta QP disable; 0: Chroma Delta QP enable # ChromaDeltaU = 0 # Chroma component U delta QP # ChromaDeltaV = 0 # Chroma component V delta QP ###################Encoder Optimization Tools############################################# ######################################################################################## #RDOQ ######################################################################################## RdoqLevel = 2 # (0: off, 1: cu level, only for best partition mode, 2: all mode) LambdaFactor = 75 # LambdaFactorP = 120 # LambdaFactorB = 100 # ######################################################################################## #Refine QP (Only used at RA configuration) ######################################################################################## RefineQP = 0 # Enable refined quantization ########################################################################################## # TDRDO (Only used at LD configuration) ########################################################################################## TDRDOEnable = 0 # (0: default disable Block level TDRDO, 1: enable when LD encoding) ########################################################################################## #RATECONTROL ########################################################################################## RateControl = 0 # 0: CQP, 1: CBR (frame level), 2: CBR (SCU level), 3: VBR TargetBitRate = 1000000 # target bitrate, in bps initial_qp = 34 # initial qp for first frame (0-63) min_qp = 20 # min qp for rate control (0-63) max_qp = 55 # max qp for rate control (0-63) # ---------------------------------------------------------------------------- # preset level # ---------------------------------------------------------------------------- preset_level = 8 # preset level 
for the tradeoff between speed and performance, ordered from fastest to slowest (default 5) # ---------------------------------------------------------------------------- # slices # ---------------------------------------------------------------------------- slice_num = 1 # number of slices per frame # ---------------------------------------------------------------------------- # parallel # ---------------------------------------------------------------------------- thread_frames = 1 # number of parallel threads for frames ( 0: auto ) thread_rows = 1 # number of parallel threads for rows ( 0: auto ) # ---------------------------------------------------------------------------- # log level # ---------------------------------------------------------------------------- log_level = 3 # log level: -1: none, 0: error, 1: warning, 2: info, 3: debug xavs2-1.3/config/encoder_ra.cfg000066400000000000000000000262671340660520300164630ustar00rootroot00000000000000# New Input File Format is as follows # = # Comment # See configfile.h for a list of supported ParameterNames ########################################################################################## # Files ########################################################################################## InputFile = "d:\seqs\AVS2_Test_Seq\BasketballPass_416x240_50.yuv" # Input sequence, YUV 4:2:0 InputHeaderLength = 0 # If the inputfile has a header, state it's length in byte here FramesToBeEncoded = 97 # Number of frames to be coded SourceWidth = 416 # Image width in Pels, must be multiple of 16 SourceHeight = 240 # Image height in Pels, must be multiple of 16 InputSampleBitDepth = 8 # Source bit depth (8 bit or 10 bit for all components) SampleBitDepth = 8 # Internal coding bit depth (8 bit or 10 bit for all components) ReconFile = "test_rec.yuv" OutputFile = "test.avs" ########################################################################################## # Maximum Size ########################################################################################## MaxSizeInBit = 6 # Maximum CU size ########################################################################################## # Encoder Control ########################################################################################## ProfileID = 32 # Profile ID (18: MAIN PICTURE profile, 32: MAIN profile, 34: MAIN10 profile) LevelID = 66 # Level ID (16: 2.0; 32: 4.0; 34: 4.2; 64: 6.0; 66: 6.2) IntraPeriodMin = 48 # min Period of I-Frames (0=only first) IntraPeriodMax = 48 # max Period of I-Frames (0=only first) OpenGOP = 1 # Open GOP UseHadamard = 1 # Hadamard transform (0=not used, 1=used) FME = 3 # Fast Motion Estimation method (1: DIA, 2: HEX 3: UMH, 4: TZ) SearchRange = 64 # Max search range NumberReferenceFrames = 4 # Number of previous frames used for inter motion search (1-5) inter_2PU = 1 # Inter block search 2NxN or Nx2N or AMP (0=disable, 1=enable) inter_AMP = 1 # Inter block search AMP (0=disable, 1=enable) ########################################################################################## # F Frames ########################################################################################## FFRAMEEnable = 1 # (0: Don't use F frames 1:Use F frames instead of P frames) DHPEnable = 1 # (0: Don't use DHP, 1:Use DHP) MHPSKIPEnable = 1 # (0: Don't use MH_PSKIP, 1:Use MH_PSKIP) WSMEnable = 1 # (0: Don't use WSM, 1:Use WSM) ########################################################################################## # RD Optimization 
########################################################################################## RdoLevel = 3 # RD-optimized mode decision (0:off, 1: only for best partition mode of one CU, 2: only for best 2 partition modes; 3: All partition modes) ########################################################################################## # Coding Tools ########################################################################################## PMVREnable = 1 # NSQT = 1 # SDIP = 1 # SECTEnable = 1 # (0: Not use Secondary Transform, 1:Use Secondary Transform) ########################################################################################## # Loop filter parameters ########################################################################################## CrossSliceLoopFilter = 1 # Enable Cross Slice Boundary Filter (0=Disable, 1=Enable) LoopFilterDisable = 0 # Disable loop filter in picture header (0=Filter, 1=No Filter) LoopFilterParameter = 0 # Send loop filter parameter (0= No parameter, 1= Send Parameter) LoopFilterAlphaOffset = 0 # Aplha offset in loop filter LoopFilterBetaOffset = 0 # Beta offset in loop filter SAOEnable = 1 # Enable SAO (1=SAO on, 0=SAO OFF) ALFEnable = 1 # Enable ALF (1=ALF on, 0=ALF OFF) ALF_LowLatencyEncodingEnable = 0 # Enable Low Latency ALF (1=Low Latency ALF, 0=High Efficiency ALF) ########################################################################################## # Reference Picture Manage ########################################################################################## gop_size = 8 # sub GOP size (negative numbers indicating an employ of default settings, which will invliadate the following settings.) NumberBFrames = 7 # Number of B frames inserted (0=not used) # POC QPoffset #ref_pics_active ref_pic reference_pictures predict_deltaRPS num_ref_be_removed ref_be_removed Frame1: 8 1 4 1 8 3 7 16 0 2 16 17 Frame2: 4 1 2 1 1 9 0 1 4 Frame3: 2 2 2 1 1 10 0 1 9 Frame4: 1 4 2 0 1 11 0 0 Frame5: 3 4 2 0 3 2 0 0 Frame6: 6 2 2 1 5 4 0 0 Frame7: 5 4 2 0 1 5 0 1 4 Frame8: 7 4 2 0 7 2 0 0 ########################################################################################## #frame rate ########################################################################################### FrameRate = 8 # (1: 24000/1001,2: 24,3: 25,4: 30000/1001,5: 30,6: 50,7: 60000/1001,8: 60) ########################################################################################### #chroma format parameters ########################################################################################### ChromaFormat = 1 # (0=4:0:0,1=4:2:0,2=4:2:2) ######################################################################################## # Frequency Weighting Quantization ######################################################################################## WQEnable = 0 # Frequency Weighting Quantization (0=disable,1=enable) # SeqWQM = 0 # (0:default, 1:user define) # SeqWQFile = "seq_wq_matrix.txt" # # PicWQEnable = 0 # Frame level Frequency Weighting Quantization (0=disable,1=enable) # WQParam = 0 # Weighting Quantization Parameter(0=Default, 1=UnDetailed, 2=Detailed) # PicWQDataIndex = 1 # Picture level WQ data index (0:refer to seq_header, # # 1:derived by WQ parameter, # # 2:load from pic_header) # PicWQFile = "pic_wq_matrix.txt" # WQModel = 1 # Frequency Weighting Quantization Model (0-2) # WeightParamDetailed = "[128,98,106,116,116,128]" # User defined Parameters-Detailed # WeightParamUnDetailed = "[135,143,143,160,160,213]" # User defined Parameters-UnDetailed # # 
ChromaDeltaQPDisable = 1 # 1: Chroma Delta QP disable; 0: Chroma Delta QP enable # ChromaDeltaU = 0 # Chroma component U delta QP # ChromaDeltaV = 0 # Chroma component V delta QP ###################Encoder Optimization Tools############################################# ######################################################################################## #RDOQ ######################################################################################## RdoqLevel = 2 # (0: off, 1: cu level, only for best partition mode, 2: all mode) LambdaFactor = 75 # LambdaFactorP = 120 # LambdaFactorB = 100 # ######################################################################################## #Refine QP (Only used at RA configuration) ######################################################################################## RefineQP = 1 # Enable refined quantization ########################################################################################## # TDRDO (Only used at LD configuration) ########################################################################################## TDRDOEnable = 0 # (0: default disable Block level TDRDO, 1: enable when LD encoding) ########################################################################################## #RATECONTROL ########################################################################################## RateControl = 0 # 0: CQP, 1: CBR (frame level), 2: CBR (SCU level), 3: VBR TargetBitRate = 1000000 # target bitrate, in bps initial_qp = 34 # initial qp for first frame (0-63) min_qp = 20 # min qp for rate control (0-63) max_qp = 55 # max qp for rate control (0-63) # ---------------------------------------------------------------------------- # preset level # ---------------------------------------------------------------------------- preset_level = 8 # preset level for the tradeoff between speed and performance, ordered from fastest to slowest (default 5) # ---------------------------------------------------------------------------- # slices # ---------------------------------------------------------------------------- slice_num = 1 # number of slices per frame # ---------------------------------------------------------------------------- # parallel # ---------------------------------------------------------------------------- num_parallel_gop = 1 # number of parallel GOPs (0,1: no GOP parallelization) thread_frames = 1 # number of parallel threads for frames ( 0: auto ) thread_rows = 1 # number of parallel threads for rows ( 0: auto ) # ---------------------------------------------------------------------------- # log level # ---------------------------------------------------------------------------- log_level = 3 # log level: -1: none, 0: error, 1: warning, 2: info, 3: debug xavs2-1.3/config/seq-template.cfg000066400000000000000000000017651340660520300167570ustar00rootroot00000000000000########################################################################################## # Input File Configuration # Suggested parameters: encoder_*.cfg -f seq.cfg # **Do not modify this file.** # You can copy this file and rename as you like ########################################################################################## InputFile = "E:\Seq\Foreman_352x288_30.yuv" # Input sequence, YUV 4:2:0 FramesToBeEncoded = 50 # Number of frames to be coded SourceWidth = 352 # Image width in Pels, must be multiple of 16 SourceHeight = 288 # Image height in Pels, must be multiple of 16 InputSampleBitDepth = 8 # Source bit depth (8 bit or 10 bit for all components) 
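# Illustrative example (not part of the original template): for a 10-bit
# source both depths are raised together, e.g.
#   InputSampleBitDepth = 10
#   SampleBitDepth      = 10
# and the encoder cfg would typically use ProfileID = 34 (MAIN10); this
# presumably also requires a build with high bit depth enabled (the VS
# projects in this tree compile with HIGH_BIT_DEPTH=0).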
SampleBitDepth = 8 # Internal coding bit depth (8 bit or 10 bit for all components) ReconFile = "test_rec.yuv" OutputFile = "test.avs" # TraceFile = "trace_enc.txt" xavs2-1.3/source/000077500000000000000000000000001340660520300137175ustar00rootroot00000000000000xavs2-1.3/source/common/000077500000000000000000000000001340660520300152075ustar00rootroot00000000000000xavs2-1.3/source/common/avs2_defs.h000066400000000000000000000335451340660520300172460ustar00rootroot00000000000000/* * avs2_defs.h * * Description of this file: * Struct definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_AVS2_DEFINITIONS_H #define XAVS2_AVS2_DEFINITIONS_H #include #include #include "defines.h" #include "osdep.h" #include "basic_types.h" #if (ARCH_X86 || ARCH_X86_64) #include #endif /* --------------------------------------------------------------------------- */ enum intra_avail_e { MD_I_LEFT = 0, MD_I_TOP = 1, MD_I_LEFT_DOWN = 2, MD_I_TOP_RIGHT = 3, MD_I_TOP_LEFT = 4, MD_I_NUM = 5, #define IS_NEIGHBOR_AVAIL(i_avai, md) ((i_avai) & (1 << (md))) }; enum transform_scan_direction_e { INTRA_PRED_VER = 0, INTRA_PRED_HOR, INTRA_PRED_DC_DIAG }; /* --------------------------------------------------------------------------- * luma intra prediction modes */ enum intra_pred_mode_e{ /* non-angular mode */ DC_PRED = 0 , /* prediction mode: DC */ PLANE_PRED = 1 , /* prediction mode: PLANE */ BI_PRED = 2 , /* prediction mode: BI */ /* vertical angular mode */ INTRA_ANG_X_3 = 3, INTRA_ANG_X_4 = 4, INTRA_ANG_X_5 = 5, INTRA_ANG_X_6 = 6, INTRA_ANG_X_7 = 7, INTRA_ANG_X_8 = 8, INTRA_ANG_X_9 = 9, INTRA_ANG_X_10 = 10, INTRA_ANG_X_11 = 11, INTRA_ANG_X_12 = 12, VERT_PRED = INTRA_ANG_X_12, /* prediction mode: VERT */ /* vertical + horizontal angular mode */ INTRA_ANG_XY_13 = 13, INTRA_ANG_XY_14 = 14, INTRA_ANG_XY_15 = 15, INTRA_ANG_XY_16 = 16, INTRA_ANG_XY_17 = 17, INTRA_ANG_XY_18 = 18, INTRA_ANG_XY_19 = 19, INTRA_ANG_XY_20 = 20, INTRA_ANG_XY_21 = 21, INTRA_ANG_XY_22 = 22, INTRA_ANG_XY_23 = 23, /* horizontal angular mode */ INTRA_ANG_Y_24 = 24, INTRA_ANG_Y_25 = 25, INTRA_ANG_Y_26 = 26, INTRA_ANG_Y_27 = 27, INTRA_ANG_Y_28 = 28, INTRA_ANG_Y_29 = 29, INTRA_ANG_Y_30 = 30, INTRA_ANG_Y_31 = 31, INTRA_ANG_Y_32 = 32, HOR_PRED = INTRA_ANG_Y_24, /* prediction mode: HOR */ NUM_INTRA_MODE = 33, /* number of luma intra prediction modes */ }; /* 
--------------------------------------------------------------------------- * chroma intra prediction modes */ enum intra_chroma_pred_mode_e { /* chroma intra prediction modes */ DM_PRED_C = 0, /* prediction mode: DM */ DC_PRED_C = 1, /* prediction mode: DC */ HOR_PRED_C = 2, /* prediction mode: HOR */ VERT_PRED_C = 3, /* prediction mode: VERT */ BI_PRED_C = 4, /* prediction mode: BI */ NUM_INTRA_MODE_CHROMA = 5, /* number of chroma intra prediction modes */ }; /* --------------------------------------------------------------------------- */ enum mvp_e { MVP_MEDIAN = 0, /* mv pred type: median */ MVP_LEFT = 1, /* : left */ MVP_TOP = 2, /* : top */ MVP_TR = 3 /* : top-right */ }; /* --------------------------------------------------------------------------- */ enum inter_pred_direction_e { PDIR_FWD = 0, /* pred direction: forward */ PDIR_BWD = 1, /* : backward */ PDIR_SYM = 2, /* : symmetric */ PDIR_BID = 3, /* : bidirectional */ PDIR_DUAL = 4, /* : dual */ PDIR_INVALID =-1 /* : invalid */ }; /* --------------------------------------------------------------------------- * reference index */ enum inter_pred_index_e { INVALID_REF = -1, /* invalid reference index */ B_BWD = 0, /* backward reference index for B frame: h->fref[0], used for ref_idx derivation */ B_FWD = 1 /* forward reference index for B frame: h->fref[1], used for ref_idx derivation */ }; /* --------------------------------------------------------------------------- */ enum direct_skip_mode_e { DS_NONE = -1, /* no spatial direct/skip mode */ /* spatial direct/skip mode for B frame */ DS_B_BID = 0, /* skip/direct mode: bi-direction */ DS_B_BWD = 1, /* : backward direction */ DS_B_SYM = 2, /* : symmetrical direction */ DS_B_FWD = 3, /* : forward direction */ /* spatial direct/skip mode for F frame */ DS_DUAL_1ST = 0, /* skip/direct mode: dual 1st */ DS_DUAL_2ND = 1, /* : dual 2nd */ DS_SINGLE_1ST = 2, /* : single 1st */ DS_SINGLE_2ND = 3, /* : single 2st */ /* max number */ DS_MAX_NUM = 4 /* max spatial direct/skip mode number of B or F frames */ }; /* --------------------------------------------------------------------------- * neighbor position used in inter coding (MVP) or intra prediction */ enum neighbor_block_pos_e { BLK_TOPLEFT = 0, /* D: top-left block: (x - 1, y - 1) */ BLK_TOP = 1, /* B: top block: (x , y - 1) */ BLK_LEFT = 2, /* A: left block: (x - 1, y ) */ BLK_TOPRIGHT = 3, /* C: top-right block: (x + W , y - 1) */ BLK_TOP2 = 4, /* G: top block: (x + W - 1, y - 1) */ BLK_LEFT2 = 5, /* F: left block: (x - 1, y + H - 1) */ BLK_COL = 6, /* Z: collocated block of temporal neighbor */ }; /* --------------------------------------------------------------------------- * level for RDO */ enum rdo_level_e { RDO_OFF = 0, /* disable RDO */ RDO_CU_LEVEL1 = 1, /* conduct RDO only for best 1 partition mode of CU */ RDO_CU_LEVEL2 = 2, /* conduct RDO only for best 2 partition mode of CU, * including 1 skip/direct mode and 1 normal partition mode */ RDO_ALL = 3 /* conduct for all partition modes */ }; /* --------------------------------------------------------------------------- * level for RDOQ */ enum rdoq_level_e { RDOQ_OFF = 0, /* disable RDOQ */ RDOQ_CU_LEVEL = 1, /* conduct RDOQ only for best partition mode of CU */ RDOQ_ALL = 2 /* conduct for all modes */ }; /* --------------------------------------------------------------------------- */ enum sao_component_index_e { SAO_Y = 0, SAO_Cb, SAO_Cr, NUM_SAO_COMPONENTS }; /* --------------------------------------------------------------------------- */ enum sao_mode_merge_type_e { 
SAO_MERGE_LEFT = 0, SAO_MERGE_ABOVE, SAO_MERGE_NONE, NUM_SAO_MERGE_TYPES = 2 }; /* --------------------------------------------------------------------------- */ enum sao_mode_type_e { SAO_TYPE_OFF = -1, SAO_TYPE_EO_0, SAO_TYPE_EO_90, SAO_TYPE_EO_135, SAO_TYPE_EO_45, SAO_TYPE_BO, NUM_SAO_NEW_TYPES }; /* --------------------------------------------------------------------------- * EO Groups, the assignments depend on how you implement the edgeType calculation */ enum sao_class_e { SAO_CLASS_EO_FULL_VALLEY = 0, SAO_CLASS_EO_HALF_VALLEY = 1, SAO_CLASS_EO_PLAIN = 2, SAO_CLASS_EO_HALF_PEAK = 3, SAO_CLASS_EO_FULL_PEAK = 4, SAO_CLASS_BO = 5, NUM_SAO_EO_CLASSES = SAO_CLASS_BO, NUM_SAO_OFFSET }; /* * =========================================================================== * macros * =========================================================================== */ #define XAVS2_MIN(a, b) ((a) < (b)? (a) : (b)) #define XAVS2_MAX(a, b) ((a) > (b)? (a) : (b)) #define XAVS2_MIN3(a, b, c) XAVS2_MIN((a), XAVS2_MIN((b),(c))) #define XAVS2_MAX3(a, b, c) XAVS2_MAX((a), XAVS2_MAX((b),(c))) #define XAVS2_CLIP1(a) ((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a))) #define XAVS2_CLIP3F(L, H, v) (((v) < (L)) ? (L) : (((v) > (H)) ? (H) : (v))) #define XAVS2_CLIP3(L, H, v) xavs2_clip3(L, H, v) #define XAVS2_ABS(A) ((A) < 0 ? (-(A)) : (A)) // abs macro, faster than procedure #define XAVS2_SWAP(x, y) {(y) = (y) ^ (x); (x) = (y) ^ (x); (y) = (x) ^ (y);} #ifdef __cplusplus template<typename T> static void XAVS2_SWAP_PTR(T *&x, T *&y) { T *t = x; x = y; y = t; } #else #define XAVS2_SWAP_PTR(x, y) {void *_t = (void *)(x); (x) = (y); (y) = _t;} #endif #define XAVS2_ALIGN(x, a) (((x) + ((a) - 1)) & (~((a) - 1))) #define MAKEDWORD(mx, my) (((my) << 16) | ((mx) & 0xFFFF)) /** * =========================================================================== * global variables * =========================================================================== */ static const int g_bit_depth = BIT_DEPTH; static const int max_pel_value = (1 << BIT_DEPTH) - 1; static const int g_dc_value = (1 << BIT_DEPTH) >> 1; /** * =========================================================================== * inline function defines * =========================================================================== */ static ALWAYS_INLINE pel_t xavs2_clip_pixel(int x) { return (pel_t)((x & ~max_pel_value) ? (-x) >> 31 & max_pel_value : x); } static ALWAYS_INLINE int xavs2_clip3(int i_min, int i_max, int v) { return ((v < i_min) ? i_min : (v > i_max) ? i_max : v); } static ALWAYS_INLINE double xavs2_clip3f(double f_min, double f_max, double v) { return ((v < f_min) ? f_min : (v > f_max) ? f_max : v); } static ALWAYS_INLINE float xavs2_clip3ff(float f_min, float f_max, float v) { return ((v < f_min) ? f_min : (v > f_max) ? f_max : v); } static ALWAYS_INLINE int xavs2_median(int a, int b, int c) { int t = (a - b) & ((a - b) >> 31); a -= t; b += t; b -= (b - c) & ((b - c) >> 31); b += (a - b) & ((a - b) >> 31); return b; } // sign of the value: returns -1 for negative values, +1 otherwise static ALWAYS_INLINE int xavs2_sign2(int val) { return ((val >> 31) << 1) + 1; } // sign of the value: returns -1 for negative, 0 for zero, +1 for positive static ALWAYS_INLINE int xavs2_sign3(int val) { return (val >> 31) | (int)(((uint32_t)-val) >> 31u); } // log2 of the value: 0 when val is 0 or 1, otherwise log2(val) #define xavs2_log2u(val) xavs2_ctz(val) /* --------------------------------------------------------------------------- * unions for type-punning.
* Mn: load or store n bits, aligned, native-endian * CPn: copy n bits, aligned, native-endian * we don't use memcpy for CPn because memcpy's args aren't assumed * to be aligned */ typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS xavs2_union16_t; typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS xavs2_union32_t; typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS xavs2_union64_t; #define M16(src) (((xavs2_union16_t *)(src))->i) #define M32(src) (((xavs2_union32_t *)(src))->i) #define M64(src) (((xavs2_union64_t *)(src))->i) #define CP16(dst,src) M16(dst) = M16(src) #define CP32(dst,src) M32(dst) = M32(src) #define CP64(dst,src) M64(dst) = M64(src) #define CP128(dst,src) M128(dst) = M128(src) #if defined(_MSC_VER) || defined(__ICL) #define M128(src) (*(__m128*)(src)) #define M128_ZERO _mm_setzero_ps() #else typedef struct { uint64_t i[2]; } xavs2_uint128_t; typedef union { xavs2_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS xavs2_union128_t; #define M128(src) (((xavs2_union128_t*)(src))->i) #if (ARCH_X86 || ARCH_X86_64) && defined(__SSE__) #define M128_ZERO ((__m128){0,0,0,0}) #define xavs2_union128_t xavs2_union128_sse_t typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS xavs2_union128_sse_t; #else #define M128_ZERO ((xavs2_uint128_t){{0,0}}) #endif // (ARCH_X86 || ARCH_X86_64) && defined(__SSE__) #endif // defined(_MSC_VER) || defined(__ICL) #endif // XAVS2_BASIC_TYPES_H xavs2-1.3/source/common/avs2tab.h000066400000000000000000000056241340660520300167310ustar00rootroot00000000000000/* * avs2tab.h * * Description of this file: * AVS2 tables definition of the xavs2 library (this file is ONLY included by block_info.c) * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_AVS2_TABLE_H #define XAVS2_AVS2_TABLE_H /* 任ϵɨ˳ */ extern ALIGN32(const int16_t tab_scan_4x4 [ 16][2]); extern ALIGN32(const int16_t tab_scan_4x4_yx[16][2]); extern ALIGN32(const int16_t tab_1d_scan_4x4[16]); extern const int16_t *tab_coef_scan1_list_nxn[2][4]; extern const int16_t *tab_coef_scan1_list_hor[3]; extern const int16_t *tab_coef_scan1_list_ver[3]; extern const int16_t (*tab_coef_scan_list[4])[2]; extern const int16_t (*tab_coef_scan_list_hor[3])[2]; extern const int16_t (*tab_coef_scan_list_ver[3])[2]; extern const int16_t (*tab_cg_scan_list_nxn[4])[2]; extern const int16_t (*tab_cg_scan_list_hor[3])[2]; extern const int16_t (*tab_cg_scan_list_ver[3])[2]; /* 任Сұ */ extern const uint8_t tab_split_tu_pos[MAX_PRED_MODES][4][2]; /* ˲ */ extern const uint8_t tab_deblock_alpha[64]; extern const uint8_t tab_deblock_beta[64]; extern const int tab_saoclip[NUM_SAO_OFFSET][3]; extern const uint16_t tab_Q_TAB [80]; extern const uint16_t tab_IQ_TAB [80]; extern const uint8_t tab_IQ_SHIFT[80]; extern const uint8_t tab_qp_scale_chroma[64]; extern const int8_t tab_intra_mode_luma2chroma[NUM_INTRA_MODE]; extern const int16_t tab_dmh_pos[DMH_MODE_NUM + DMH_MODE_NUM - 1][2]; extern const float FRAME_RATE[8]; extern const char * xavs2_preset_names[]; #endif // XAVS2_AVS2_TABLE_H xavs2-1.3/source/common/basic_types.h000066400000000000000000000074101340660520300176670ustar00rootroot00000000000000/* * basic_types.h * * Description of this file: * basic types definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_BASIC_TYPES_H #define XAVS2_BASIC_TYPES_H #include #include "defines.h" #include "osdep.h" /* * =========================================================================== * basic types * =========================================================================== */ typedef uint8_t pel_t; /* type for pixel */ typedef int16_t itr_t; /* intra prediction temp */ typedef uint16_t sum_t; typedef uint32_t sum2_t; typedef uint32_t pixel4; typedef int32_t ssum2_t; /* Signed sum */ typedef int32_t dist_t; typedef int8_t bool_t; /* Bool type, true or false */ typedef int16_t mct_t; /* motion compensation temp */ typedef int16_t coeff_t; /* type for transform coefficient */ typedef int32_t cmp_dist_t; /* distortion type */ typedef double rdcost_t; /* type for RDcost calculation, can also be int64_t */ /* * =========================================================================== * structure types * =========================================================================== */ typedef struct xavs2_handler_t xavs2_handler_t; /* top handler of the encoder */ typedef struct xavs2_log_t xavs2_log_t; /* log module */ typedef struct xavs2_t xavs2_t; /* main encoder context for one thread */ typedef struct xavs2_frame_t xavs2_frame_t; typedef struct xavs2_frame_buffer_t xavs2_frame_buffer_t; typedef struct ratectrl_t ratectrl_t; typedef struct cu_size_ctrl_t cu_size_ctrl_t; typedef struct td_rdo_t td_rdo_t; typedef struct aec_t aec_t; typedef struct cu_t cu_t; typedef union mv_t mv_t; typedef struct cu_info_t cu_info_t; typedef struct outputframe_t outputframe_t; /* --------------------------------------------------------------------------- * SAOStatData */ typedef struct SAOBlkParam { int mergeIdx; // 0: merge_left, 1: merge_up, 2 not merge (new parameter) int typeIdc; // OFF(-1), EO_0, EO_90, EO_135, EO_45, BO int startBand; // BO: starting band index int deltaBand; // BO: third starting band distance int offset[MAX_NUM_SAO_CLASSES]; } SAOBlkParam; #endif // XAVS2_BASIC_TYPES_H xavs2-1.3/source/common/block_info.c000066400000000000000000000226041340660520300174640ustar00rootroot00000000000000/* * block_info.c * * Description of this file: * Block-infomation functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "block_info.h" #include "cudata.h" #include "avs2tab.h" // AVS2 tables /** * =========================================================================== * global variables (const tables) * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const uint8_t tab_DL_Avail64[16 * 16] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_DL_Avail32[8 * 8] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_DL_Avail16[4 * 4] = { 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_DL_Avail8[2 * 2] = { 1, 0, 0, 0 }; static const uint8_t tab_TR_Avail64[16 * 16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_TR_Avail32[8 * 8] = { // 0: 8 1:16 2: 32 pu size 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_TR_Avail16[4 * 4] = { 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_TR_Avail8[2 * 2] = { 1, 1, 1, 0 }; /** * =========================================================================== * function definition * =========================================================================== */ /* 
--------------------------------------------------------------------------- */ static ALWAYS_INLINE cu_info_t *get_neighbor_cu_in_slice(xavs2_t *h, cu_info_t *p_cur, int slice_index_cur_cu, int x4x4, int y4x4) { const int shift_4x4 = MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT; if (x4x4 < 0 || y4x4 < 0 || x4x4 >= h->i_width_in_minpu || y4x4 >= h->i_height_in_minpu) { return NULL; } else if ((p_cur->i_scu_x << shift_4x4) <= x4x4 && (p_cur->i_scu_y << shift_4x4) <= y4x4) { return p_cur; } else { cu_info_t *p_neighbor = &h->cu_info[(y4x4 >> 1) * h->i_width_in_mincu + (x4x4 >> 1)]; return cu_get_slice_index(h, x4x4 >> 1, y4x4 >> 1) == slice_index_cur_cu ? p_neighbor : NULL; } } /* --------------------------------------------------------------------------- * get neighboring CBP */ int get_neighbor_cbp_y(xavs2_t *h, cu_info_t *p_cur, int slice_index_cur_cu, int x_4x4, int y_4x4) { cu_info_t *p_neighbor = get_neighbor_cu_in_slice(h, p_cur, slice_index_cur_cu, x_4x4, y_4x4); if (p_neighbor == NULL) { return 0; } else if (p_neighbor->i_tu_split == TU_SPLIT_NON) { return p_neighbor->i_cbp & 1; // TU not split: return the luma CBP bit of the neighboring CU directly } else { int cbp = p_neighbor->i_cbp; int level = p_neighbor->i_level - MIN_PU_SIZE_IN_BIT; int cu_mask = (1 << level) - 1; /* relative address of the 4x4 block inside the neighboring CU */ x_4x4 &= cu_mask; y_4x4 &= cu_mask; /* return the CBP bit of the transform block covering this 4x4 position */ if (p_neighbor->i_tu_split == TU_SPLIT_VER) { // vertical TU split x_4x4 >>= (level - 2); return (cbp >> x_4x4) & 1; } else if (p_neighbor->i_tu_split == TU_SPLIT_HOR) { // horizontal TU split y_4x4 >>= (level - 2); return (cbp >> y_4x4) & 1; } else { // quad TU split x_4x4 >>= (level - 1); y_4x4 >>= (level - 1); return (cbp >> (x_4x4 + (y_4x4 << 1))) & 1; } } } /* --------------------------------------------------------------------------- */ void set_available_tables(xavs2_t *h) { switch (h->i_lcu_level) { case B64X64_IN_BIT: h->tab_avail_DL = (uint8_t *)tab_DL_Avail64; h->tab_avail_TR = (uint8_t *)tab_TR_Avail64; break; case B32X32_IN_BIT: h->tab_avail_DL = (uint8_t *)tab_DL_Avail32; h->tab_avail_TR = (uint8_t *)tab_TR_Avail32; break; case B16X16_IN_BIT: h->tab_avail_DL = (uint8_t *)tab_DL_Avail16; h->tab_avail_TR = (uint8_t *)tab_TR_Avail16; break; default: h->tab_avail_DL = (uint8_t *)tab_DL_Avail8; h->tab_avail_TR = (uint8_t *)tab_TR_Avail8; break; } } /* --------------------------------------------------------------------------- * check for available neighbor CUs and set pointers in current CU */ void check_neighbor_cu_avail(xavs2_t *h, cu_t *p_cu, int scu_x, int scu_y, int scu_xy) { const int first_scu_y = h->slices[h->i_slice_index]->i_first_scu_y; int slice_index_of_cur_cu = cu_get_slice_index(h, scu_x, scu_y); /* reset */ p_cu->p_topA_cu = p_cu->p_left_cu = NULL; p_cu->p_topL_cu = p_cu->p_topR_cu = NULL; /* check top row */ if (scu_y > first_scu_y) { const int width_in_scu = h->i_width_in_mincu; const int right_cu_offset = 1 << (p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT); /* check top */ p_cu->p_topA_cu = h->cu_info + (scu_xy - width_in_scu); /* check top-left */ if (scu_x > 0) { p_cu->p_topL_cu = p_cu->p_topA_cu - 1; } /* check top-right */ if (scu_x + right_cu_offset < width_in_scu) { if (slice_index_of_cur_cu == cu_get_slice_index(h, scu_x + right_cu_offset, scu_y - 1)) { cu_info_t *p_tmp_cu = p_cu->p_topA_cu + right_cu_offset; p_cu->p_topR_cu = p_tmp_cu; } } } /* check left */ if (scu_x > 0) { p_cu->p_left_cu = &h->cu_info[scu_xy - 1]; } } xavs2-1.3/source/common/block_info.h000066400000000000000000000036651340660520300174660ustar00rootroot00000000000000/* * block_info.h * * Description of this file: * Block Information functions
definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_BLOCK_INFO_H #define XAVS2_BLOCK_INFO_H #define get_neighbor_cbp_y FPFX(get_neighbor_cbp_y) int get_neighbor_cbp_y(xavs2_t *h, cu_info_t *p_cur, int slice_idx_cur_cu, int x_4x4, int y_4x4); #define set_available_tables FPFX(set_available_tables) void set_available_tables(xavs2_t *h); #define check_neighbor_cu_avail FPFX(check_neighbor_cu_avail) void check_neighbor_cu_avail(xavs2_t *h, cu_t *p_cu, int scu_x, int scu_y, int scu_xy); #endif // XAVS2_BLOCK_INFO_H xavs2-1.3/source/common/cg_scan.c000066400000000000000000001651511340660520300167610ustar00rootroot00000000000000/* * cg-scan.c * * Description of this file: * CG-Scan functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "primitives.h" #include "block_info.h" #include "cpu.h" /** * =========================================================================== * global variables (const tables) * =========================================================================== */ /* --------------------------------------------------------------------------- */ ALIGN32(const int16_t tab_scan_2x2[4][2]) = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } }; /* --------------------------------------------------------------------------- */ ALIGN32(const int16_t tab_scan_4x4[16][2]) = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 3, 2 }, { 2, 3 }, { 3, 3 } }; /* --------------------------------------------------------------------------- */ ALIGN32(const int16_t tab_scan_4x4_yx[16][2]) = { { 0, 0 }, { 0, 1 }, { 1, 0 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, { 0, 3 }, { 1, 2 }, { 2, 1 }, { 3, 0 }, { 3, 1 }, { 2, 2 }, { 1, 3 }, { 2, 3 }, { 3, 2 }, { 3, 3 } }; /* --------------------------------------------------------------------------- */ ALIGN32(const int16_t tab_scan_8x8[64][2]) = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 0, 4 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 3, 2 }, { 2, 3 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 2, 4 }, { 3, 3 }, { 4, 2 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 5, 2 }, { 4, 3 }, { 3, 4 }, { 2, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 }, { 2, 6 }, { 3, 5 }, { 4, 4 }, { 5, 3 }, { 6, 2 }, { 7, 1 }, { 7, 2 }, { 6, 3 }, { 5, 4 }, { 4, 5 }, { 3, 6 }, { 2, 7 }, { 3, 7 }, { 4, 6 }, { 5, 5 }, { 6, 4 }, { 7, 3 }, { 7, 4 }, { 6, 5 }, { 5, 6 }, { 4, 7 }, { 5, 7 }, { 6, 6 }, { 7, 5 }, { 7, 6 }, { 6, 7 }, { 7, 7 } }; /* --------------------------------------------------------------------------- */ const int16_t tab_scan_2x8[16][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 3, 1 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 7, 1 } }; /* --------------------------------------------------------------------------- */ const int16_t tab_scan_1x4[4][2] = { { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, }; /* --------------------------------------------------------------------------- */ const int16_t tab_scan_8x2[16][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 1, 2 }, { 0, 3 }, { 0, 4 }, { 1, 3 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 } }; /* --------------------------------------------------------------------------- */ const int16_t tab_scan_4x1[4][2] = { { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_4x16[64][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3} }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_16x4[64][2] = { { 
0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15} }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_8x32[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 16, 0}, { 17, 0}, { 16, 1}, { 16, 2}, { 17, 1}, { 18, 0}, { 19, 0}, { 18, 1}, { 17, 2}, { 16, 3}, { 17, 3}, { 18, 2}, { 19, 1}, { 19, 2}, { 18, 3}, { 19, 3}, { 20, 0}, { 21, 0}, { 20, 1}, { 20, 2}, { 21, 1}, { 22, 0}, { 23, 0}, { 22, 1}, { 21, 2}, { 20, 3}, { 21, 3}, { 22, 2}, { 23, 1}, { 23, 2}, { 22, 3}, { 23, 3}, { 16, 4}, { 17, 4}, { 16, 5}, { 16, 6}, { 17, 5}, { 18, 4}, { 19, 4}, { 18, 5}, { 17, 6}, { 16, 7}, { 17, 7}, { 18, 6}, { 19, 5}, { 19, 6}, { 18, 7}, { 19, 7}, { 20, 4}, { 21, 4}, { 20, 5}, { 20, 6}, { 21, 5}, { 22, 4}, { 23, 4}, { 22, 5}, { 21, 6}, { 20, 7}, { 21, 7}, { 22, 6}, { 23, 5}, { 23, 6}, { 22, 7}, { 23, 7}, { 24, 0}, { 25, 0}, { 24, 1}, { 24, 2}, { 25, 1}, { 26, 0}, { 27, 0}, { 26, 1}, { 25, 2}, { 24, 3}, { 25, 3}, { 26, 2}, { 27, 1}, { 27, 2}, { 26, 3}, { 27, 3}, { 28, 0}, { 29, 0}, { 28, 1}, { 28, 2}, { 29, 1}, { 30, 0}, { 31, 0}, { 30, 1}, { 29, 2}, { 28, 3}, { 29, 3}, { 30, 2}, { 31, 1}, { 31, 2}, { 30, 3}, { 31, 3}, { 24, 4}, { 25, 4}, { 24, 5}, { 24, 6}, { 25, 5}, { 26, 4}, { 27, 4}, { 26, 5}, { 25, 6}, { 24, 7}, { 25, 7}, { 26, 6}, { 27, 5}, { 27, 6}, { 26, 7}, { 27, 7}, { 28, 4}, { 29, 4}, { 28, 5}, { 28, 6}, { 29, 5}, { 30, 4}, { 31, 4}, { 30, 5}, { 29, 6}, { 28, 7}, { 29, 7}, { 30, 6}, { 31, 5}, { 31, 6}, { 30, 7}, { 31, 7} }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_32x8[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 
5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, { 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 0, 16}, { 1, 16}, { 0, 17}, { 0, 18}, { 1, 17}, { 2, 16}, { 3, 16}, { 2, 17}, { 1, 18}, { 0, 19}, { 1, 19}, { 2, 18}, { 3, 17}, { 3, 18}, { 2, 19}, { 3, 19}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 4, 16}, { 5, 16}, { 4, 17}, { 4, 18}, { 5, 17}, { 6, 16}, { 7, 16}, { 6, 17}, { 5, 18}, { 4, 19}, { 5, 19}, { 6, 18}, { 7, 17}, { 7, 18}, { 6, 19}, { 7, 19}, { 0, 20}, { 1, 20}, { 0, 21}, { 0, 22}, { 1, 21}, { 2, 20}, { 3, 20}, { 2, 21}, { 1, 22}, { 0, 23}, { 1, 23}, { 2, 22}, { 3, 21}, { 3, 22}, { 2, 23}, { 3, 23}, { 0, 24}, { 1, 24}, { 0, 25}, { 0, 26}, { 1, 25}, { 2, 24}, { 3, 24}, { 2, 25}, { 1, 26}, { 0, 27}, { 1, 27}, { 2, 26}, { 3, 25}, { 3, 26}, { 2, 27}, { 3, 27}, { 4, 20}, { 5, 20}, { 4, 21}, { 4, 22}, { 5, 21}, { 6, 20}, { 7, 20}, { 6, 21}, { 5, 22}, { 4, 23}, { 5, 23}, { 6, 22}, { 7, 21}, { 7, 22}, { 6, 23}, { 7, 23}, { 4, 24}, { 5, 24}, { 4, 25}, { 4, 26}, { 5, 25}, { 6, 24}, { 7, 24}, { 6, 25}, { 5, 26}, { 4, 27}, { 5, 27}, { 6, 26}, { 7, 25}, { 7, 26}, { 6, 27}, { 7, 27}, { 0, 28}, { 1, 28}, { 0, 29}, { 0, 30}, { 1, 29}, { 2, 28}, { 3, 28}, { 2, 29}, { 1, 30}, { 0, 31}, { 1, 31}, { 2, 30}, { 3, 29}, { 3, 30}, { 2, 31}, { 3, 31}, { 4, 28}, { 5, 28}, { 4, 29}, { 4, 30}, { 5, 29}, { 6, 28}, { 7, 28}, { 6, 29}, { 5, 30}, { 4, 31}, { 5, 31}, { 6, 30}, { 7, 29}, { 7, 30}, { 6, 31}, { 7, 31} }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_8x8[64][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 3, 2 }, { 2, 3 }, { 3, 3 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 4, 2 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 5, 2 }, { 4, 3 }, { 5, 3 }, { 6, 2 }, { 7, 1 }, { 7, 2 }, { 6, 3 }, { 7, 3 }, { 0, 4 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 2, 4 }, { 3, 4 }, { 2, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 }, { 2, 6 }, { 3, 5 }, { 3, 6 }, { 2, 7 }, { 3, 7 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 4, 6 }, { 5, 5 }, { 6, 4 }, { 7, 4 }, { 6, 5 }, { 5, 6 }, { 4, 7 }, { 5, 7 }, { 6, 6 }, { 7, 5 }, { 7, 6 }, { 6, 7 }, { 7, 7 } }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_16x16[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, 
{ 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, { 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 8, 8}, { 9, 8}, { 8, 9}, { 8, 10}, { 9, 9}, { 10, 8}, { 11, 8}, { 10, 9}, { 9, 10}, { 8, 11}, { 9, 11}, { 10, 10}, { 11, 9}, { 11, 10}, { 10, 11}, { 11, 11}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 12, 8}, { 13, 8}, { 12, 9}, { 12, 10}, { 13, 9}, { 14, 8}, { 15, 8}, { 14, 9}, { 13, 10}, { 12, 11}, { 13, 11}, { 14, 10}, { 15, 9}, { 15, 10}, { 14, 11}, { 15, 11}, { 8, 12}, { 9, 12}, { 8, 13}, { 8, 14}, { 9, 13}, { 10, 12}, { 11, 12}, { 10, 13}, { 9, 14}, { 8, 15}, { 9, 15}, { 10, 14}, { 11, 13}, { 11, 14}, { 10, 15}, { 11, 15}, { 12, 12}, { 13, 12}, { 12, 13}, { 12, 14}, { 13, 13}, { 14, 12}, { 15, 12}, { 14, 13}, { 13, 14}, { 12, 15}, { 13, 15}, { 14, 14}, { 15, 13}, { 15, 14}, { 14, 15}, { 15, 15} }; /* --------------------------------------------------------------------------- */ const int16_t tab_coef_scan_32x32[1024][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, 
{ 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 0, 16}, { 1, 16}, { 0, 17}, { 0, 18}, { 1, 17}, { 2, 16}, { 3, 16}, { 2, 17}, { 1, 18}, { 0, 19}, { 1, 19}, { 2, 18}, { 3, 17}, { 3, 18}, { 2, 19}, { 3, 19}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 8, 8}, { 9, 8}, { 8, 9}, { 8, 10}, { 9, 9}, { 10, 8}, { 11, 8}, { 10, 9}, { 9, 10}, { 8, 11}, { 9, 11}, { 10, 10}, { 11, 9}, { 11, 10}, { 10, 11}, { 11, 11}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 16, 0}, { 17, 0}, { 16, 1}, { 16, 2}, { 17, 1}, { 18, 0}, { 19, 0}, { 18, 1}, { 17, 2}, { 16, 3}, { 17, 3}, { 18, 2}, { 19, 1}, { 19, 2}, { 18, 3}, { 19, 3}, { 20, 0}, { 21, 0}, { 20, 1}, { 20, 2}, { 21, 1}, { 22, 0}, { 23, 0}, { 22, 1}, { 21, 2}, { 20, 3}, { 21, 3}, { 22, 2}, { 23, 1}, { 23, 2}, { 22, 3}, { 23, 3}, { 16, 4}, { 17, 4}, { 16, 5}, { 16, 6}, { 17, 5}, { 18, 4}, { 19, 4}, { 18, 5}, { 17, 6}, { 16, 7}, { 17, 7}, { 18, 6}, { 19, 5}, { 19, 6}, { 18, 7}, { 19, 7}, { 12, 8}, { 13, 8}, { 12, 9}, { 12, 10}, { 13, 9}, { 14, 8}, { 15, 8}, { 14, 9}, { 13, 10}, { 12, 11}, { 13, 11}, { 14, 10}, { 15, 9}, { 15, 10}, { 14, 11}, { 15, 11}, { 8, 12}, { 9, 12}, { 8, 13}, { 8, 14}, { 9, 13}, { 10, 12}, { 11, 12}, { 10, 13}, { 9, 14}, { 8, 15}, { 9, 15}, { 10, 14}, { 11, 13}, { 11, 14}, { 10, 15}, { 11, 15}, { 4, 16}, { 5, 16}, { 4, 17}, { 4, 18}, { 5, 17}, { 6, 16}, { 7, 16}, { 6, 17}, { 5, 18}, { 4, 19}, { 5, 19}, { 6, 18}, { 7, 17}, { 7, 18}, { 6, 19}, { 7, 19}, { 0, 20}, { 1, 20}, { 0, 21}, { 0, 22}, { 1, 21}, { 2, 20}, { 3, 20}, { 2, 21}, { 1, 22}, { 0, 23}, { 1, 23}, { 2, 22}, { 3, 21}, { 3, 22}, { 2, 23}, { 3, 23}, { 0, 24}, { 1, 24}, { 0, 25}, { 0, 26}, { 1, 25}, { 2, 24}, { 3, 24}, { 2, 25}, { 1, 26}, { 0, 27}, { 1, 27}, { 2, 26}, { 3, 25}, { 3, 26}, { 2, 27}, { 3, 27}, { 4, 20}, { 5, 20}, { 4, 21}, { 4, 22}, { 5, 21}, { 6, 20}, { 7, 20}, { 6, 21}, { 5, 22}, { 4, 23}, { 5, 23}, { 6, 22}, { 7, 21}, { 7, 22}, { 6, 23}, { 7, 23}, { 8, 16}, { 9, 16}, { 8, 17}, { 8, 18}, { 9, 17}, { 10, 16}, { 11, 16}, { 10, 17}, { 9, 18}, { 8, 19}, { 9, 19}, { 10, 18}, { 11, 17}, { 11, 18}, { 10, 19}, { 11, 19}, { 12, 12}, { 13, 12}, { 12, 13}, { 12, 14}, { 13, 13}, { 14, 12}, { 15, 12}, { 14, 13}, { 13, 14}, { 12, 15}, { 13, 15}, { 14, 14}, { 15, 13}, { 15, 14}, { 14, 15}, { 15, 15}, { 16, 8}, { 17, 8}, { 16, 9}, { 16, 10}, { 17, 9}, { 18, 8}, { 19, 8}, { 18, 9}, { 17, 10}, { 16, 11}, { 17, 11}, { 18, 10}, { 19, 9}, { 19, 10}, { 18, 11}, { 19, 11}, { 20, 4}, { 21, 4}, { 20, 5}, { 20, 6}, { 21, 5}, { 22, 4}, { 23, 4}, { 22, 5}, { 21, 6}, { 20, 7}, { 21, 7}, { 22, 6}, { 23, 5}, { 23, 6}, { 22, 7}, { 23, 7}, { 24, 0}, { 25, 0}, { 24, 1}, { 24, 2}, { 25, 1}, { 26, 0}, { 27, 0}, { 26, 1}, { 25, 2}, { 24, 3}, { 25, 3}, { 26, 2}, { 27, 1}, { 27, 2}, { 26, 3}, { 27, 3}, { 28, 0}, { 29, 0}, { 28, 1}, { 28, 2}, { 29, 1}, { 30, 0}, { 31, 0}, { 30, 1}, { 29, 2}, { 28, 3}, { 29, 3}, { 30, 2}, { 31, 1}, { 31, 2}, { 30, 3}, { 31, 3}, { 24, 4}, { 25, 4}, { 24, 5}, { 24, 6}, { 25, 5}, { 26, 4}, { 27, 4}, { 26, 5}, { 25, 6}, { 24, 7}, { 25, 7}, { 26, 6}, { 27, 5}, { 27, 6}, { 26, 7}, { 27, 7}, { 20, 8}, { 21, 8}, { 20, 9}, { 20, 10}, { 21, 9}, { 22, 8}, { 23, 8}, { 22, 9}, { 21, 10}, { 20, 11}, { 21, 11}, { 22, 10}, { 
23, 9}, { 23, 10}, { 22, 11}, { 23, 11}, { 16, 12}, { 17, 12}, { 16, 13}, { 16, 14}, { 17, 13}, { 18, 12}, { 19, 12}, { 18, 13}, { 17, 14}, { 16, 15}, { 17, 15}, { 18, 14}, { 19, 13}, { 19, 14}, { 18, 15}, { 19, 15}, { 12, 16}, { 13, 16}, { 12, 17}, { 12, 18}, { 13, 17}, { 14, 16}, { 15, 16}, { 14, 17}, { 13, 18}, { 12, 19}, { 13, 19}, { 14, 18}, { 15, 17}, { 15, 18}, { 14, 19}, { 15, 19}, { 8, 20}, { 9, 20}, { 8, 21}, { 8, 22}, { 9, 21}, { 10, 20}, { 11, 20}, { 10, 21}, { 9, 22}, { 8, 23}, { 9, 23}, { 10, 22}, { 11, 21}, { 11, 22}, { 10, 23}, { 11, 23}, { 4, 24}, { 5, 24}, { 4, 25}, { 4, 26}, { 5, 25}, { 6, 24}, { 7, 24}, { 6, 25}, { 5, 26}, { 4, 27}, { 5, 27}, { 6, 26}, { 7, 25}, { 7, 26}, { 6, 27}, { 7, 27}, { 0, 28}, { 1, 28}, { 0, 29}, { 0, 30}, { 1, 29}, { 2, 28}, { 3, 28}, { 2, 29}, { 1, 30}, { 0, 31}, { 1, 31}, { 2, 30}, { 3, 29}, { 3, 30}, { 2, 31}, { 3, 31}, { 4, 28}, { 5, 28}, { 4, 29}, { 4, 30}, { 5, 29}, { 6, 28}, { 7, 28}, { 6, 29}, { 5, 30}, { 4, 31}, { 5, 31}, { 6, 30}, { 7, 29}, { 7, 30}, { 6, 31}, { 7, 31}, { 8, 24}, { 9, 24}, { 8, 25}, { 8, 26}, { 9, 25}, { 10, 24}, { 11, 24}, { 10, 25}, { 9, 26}, { 8, 27}, { 9, 27}, { 10, 26}, { 11, 25}, { 11, 26}, { 10, 27}, { 11, 27}, { 12, 20}, { 13, 20}, { 12, 21}, { 12, 22}, { 13, 21}, { 14, 20}, { 15, 20}, { 14, 21}, { 13, 22}, { 12, 23}, { 13, 23}, { 14, 22}, { 15, 21}, { 15, 22}, { 14, 23}, { 15, 23}, { 16, 16}, { 17, 16}, { 16, 17}, { 16, 18}, { 17, 17}, { 18, 16}, { 19, 16}, { 18, 17}, { 17, 18}, { 16, 19}, { 17, 19}, { 18, 18}, { 19, 17}, { 19, 18}, { 18, 19}, { 19, 19}, { 20, 12}, { 21, 12}, { 20, 13}, { 20, 14}, { 21, 13}, { 22, 12}, { 23, 12}, { 22, 13}, { 21, 14}, { 20, 15}, { 21, 15}, { 22, 14}, { 23, 13}, { 23, 14}, { 22, 15}, { 23, 15}, { 24, 8}, { 25, 8}, { 24, 9}, { 24, 10}, { 25, 9}, { 26, 8}, { 27, 8}, { 26, 9}, { 25, 10}, { 24, 11}, { 25, 11}, { 26, 10}, { 27, 9}, { 27, 10}, { 26, 11}, { 27, 11}, { 28, 4}, { 29, 4}, { 28, 5}, { 28, 6}, { 29, 5}, { 30, 4}, { 31, 4}, { 30, 5}, { 29, 6}, { 28, 7}, { 29, 7}, { 30, 6}, { 31, 5}, { 31, 6}, { 30, 7}, { 31, 7}, { 28, 8}, { 29, 8}, { 28, 9}, { 28, 10}, { 29, 9}, { 30, 8}, { 31, 8}, { 30, 9}, { 29, 10}, { 28, 11}, { 29, 11}, { 30, 10}, { 31, 9}, { 31, 10}, { 30, 11}, { 31, 11}, { 24, 12}, { 25, 12}, { 24, 13}, { 24, 14}, { 25, 13}, { 26, 12}, { 27, 12}, { 26, 13}, { 25, 14}, { 24, 15}, { 25, 15}, { 26, 14}, { 27, 13}, { 27, 14}, { 26, 15}, { 27, 15}, { 20, 16}, { 21, 16}, { 20, 17}, { 20, 18}, { 21, 17}, { 22, 16}, { 23, 16}, { 22, 17}, { 21, 18}, { 20, 19}, { 21, 19}, { 22, 18}, { 23, 17}, { 23, 18}, { 22, 19}, { 23, 19}, { 16, 20}, { 17, 20}, { 16, 21}, { 16, 22}, { 17, 21}, { 18, 20}, { 19, 20}, { 18, 21}, { 17, 22}, { 16, 23}, { 17, 23}, { 18, 22}, { 19, 21}, { 19, 22}, { 18, 23}, { 19, 23}, { 12, 24}, { 13, 24}, { 12, 25}, { 12, 26}, { 13, 25}, { 14, 24}, { 15, 24}, { 14, 25}, { 13, 26}, { 12, 27}, { 13, 27}, { 14, 26}, { 15, 25}, { 15, 26}, { 14, 27}, { 15, 27}, { 8, 28}, { 9, 28}, { 8, 29}, { 8, 30}, { 9, 29}, { 10, 28}, { 11, 28}, { 10, 29}, { 9, 30}, { 8, 31}, { 9, 31}, { 10, 30}, { 11, 29}, { 11, 30}, { 10, 31}, { 11, 31}, { 12, 28}, { 13, 28}, { 12, 29}, { 12, 30}, { 13, 29}, { 14, 28}, { 15, 28}, { 14, 29}, { 13, 30}, { 12, 31}, { 13, 31}, { 14, 30}, { 15, 29}, { 15, 30}, { 14, 31}, { 15, 31}, { 16, 24}, { 17, 24}, { 16, 25}, { 16, 26}, { 17, 25}, { 18, 24}, { 19, 24}, { 18, 25}, { 17, 26}, { 16, 27}, { 17, 27}, { 18, 26}, { 19, 25}, { 19, 26}, { 18, 27}, { 19, 27}, { 20, 20}, { 21, 20}, { 20, 21}, { 20, 22}, { 21, 21}, { 22, 20}, { 23, 20}, { 22, 21}, { 21, 
22}, { 20, 23}, { 21, 23}, { 22, 22}, { 23, 21}, { 23, 22}, { 22, 23}, { 23, 23}, { 24, 16}, { 25, 16}, { 24, 17}, { 24, 18}, { 25, 17}, { 26, 16}, { 27, 16}, { 26, 17}, { 25, 18}, { 24, 19}, { 25, 19}, { 26, 18}, { 27, 17}, { 27, 18}, { 26, 19}, { 27, 19}, { 28, 12}, { 29, 12}, { 28, 13}, { 28, 14}, { 29, 13}, { 30, 12}, { 31, 12}, { 30, 13}, { 29, 14}, { 28, 15}, { 29, 15}, { 30, 14}, { 31, 13}, { 31, 14}, { 30, 15}, { 31, 15}, { 28, 16}, { 29, 16}, { 28, 17}, { 28, 18}, { 29, 17}, { 30, 16}, { 31, 16}, { 30, 17}, { 29, 18}, { 28, 19}, { 29, 19}, { 30, 18}, { 31, 17}, { 31, 18}, { 30, 19}, { 31, 19}, { 24, 20}, { 25, 20}, { 24, 21}, { 24, 22}, { 25, 21}, { 26, 20}, { 27, 20}, { 26, 21}, { 25, 22}, { 24, 23}, { 25, 23}, { 26, 22}, { 27, 21}, { 27, 22}, { 26, 23}, { 27, 23}, { 20, 24}, { 21, 24}, { 20, 25}, { 20, 26}, { 21, 25}, { 22, 24}, { 23, 24}, { 22, 25}, { 21, 26}, { 20, 27}, { 21, 27}, { 22, 26}, { 23, 25}, { 23, 26}, { 22, 27}, { 23, 27}, { 16, 28}, { 17, 28}, { 16, 29}, { 16, 30}, { 17, 29}, { 18, 28}, { 19, 28}, { 18, 29}, { 17, 30}, { 16, 31}, { 17, 31}, { 18, 30}, { 19, 29}, { 19, 30}, { 18, 31}, { 19, 31}, { 20, 28}, { 21, 28}, { 20, 29}, { 20, 30}, { 21, 29}, { 22, 28}, { 23, 28}, { 22, 29}, { 21, 30}, { 20, 31}, { 21, 31}, { 22, 30}, { 23, 29}, { 23, 30}, { 22, 31}, { 23, 31}, { 24, 24}, { 25, 24}, { 24, 25}, { 24, 26}, { 25, 25}, { 26, 24}, { 27, 24}, { 26, 25}, { 25, 26}, { 24, 27}, { 25, 27}, { 26, 26}, { 27, 25}, { 27, 26}, { 26, 27}, { 27, 27}, { 28, 20}, { 29, 20}, { 28, 21}, { 28, 22}, { 29, 21}, { 30, 20}, { 31, 20}, { 30, 21}, { 29, 22}, { 28, 23}, { 29, 23}, { 30, 22}, { 31, 21}, { 31, 22}, { 30, 23}, { 31, 23}, { 28, 24}, { 29, 24}, { 28, 25}, { 28, 26}, { 29, 25}, { 30, 24}, { 31, 24}, { 30, 25}, { 29, 26}, { 28, 27}, { 29, 27}, { 30, 26}, { 31, 25}, { 31, 26}, { 30, 27}, { 31, 27}, { 24, 28}, { 25, 28}, { 24, 29}, { 24, 30}, { 25, 29}, { 26, 28}, { 27, 28}, { 26, 29}, { 25, 30}, { 24, 31}, { 25, 31}, { 26, 30}, { 27, 29}, { 27, 30}, { 26, 31}, { 27, 31}, { 28, 28}, { 29, 28}, { 28, 29}, { 28, 30}, { 29, 29}, { 30, 28}, { 31, 28}, { 30, 29}, { 29, 30}, { 28, 31}, { 29, 31}, { 30, 30}, { 31, 29}, { 31, 30}, { 30, 31}, { 31, 31} }; /* --------------------------------------------------------------------------- */ const int16_t(*tab_coef_scan_list[4])[2] = { tab_scan_4x4, tab_coef_scan_8x8, tab_coef_scan_16x16, tab_coef_scan_32x32 }; /* --------------------------------------------------------------------------- */ const int16_t(*tab_coef_scan_list_hor[3])[2] = { NULL, tab_coef_scan_4x16, tab_coef_scan_8x32 }; /* --------------------------------------------------------------------------- */ const int16_t(*tab_coef_scan_list_ver[3])[2] = { NULL, tab_coef_scan_16x4, tab_coef_scan_32x8 }; /* --------------------------------------------------------------------------- */ const int16_t(*tab_cg_scan_list_nxn[])[2] = { tab_scan_2x2, // 4x4 tab_scan_2x2, // 8x8 tab_scan_4x4, // 16x16 tab_scan_8x8 // 32x32 }; const int16_t(*tab_cg_scan_list_hor[3])[2] = { NULL, tab_scan_1x4, tab_scan_2x8, }; const int16_t(*tab_cg_scan_list_ver[3])[2] = { NULL, tab_scan_4x1, tab_scan_8x2, }; /* --------------------------------------------------------------------------- * һάɨ˳ */ ALIGN32(const int16_t tab_1d_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; ALIGN32(const int16_t tab_1d_scan_8x8[64]) = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 25, 18, 11, 19, 26, 27, 4, 5, 12, 20, 13, 6, 7, 14, 21, 28, 29, 22, 15, 23, 30, 31, 32, 33, 40, 48, 41, 34, 35, 42, 49, 56, 57, 50, 43, 
51, 58, 59, 36, 37, 44, 52, 45, 38, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; ALIGN32(const int16_t tab_1d_scan_16x16[256]) = { 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 49, 34, 19, 35, 50, 51, 4, 5, 20, 36, 21, 6, 7, 22, 37, 52, 53, 38, 23, 39, 54, 55, 64, 65, 80, 96, 81, 66, 67, 82, 97, 112, 113, 98, 83, 99, 114, 115, 128, 129, 144, 160, 145, 130, 131, 146, 161, 176, 177, 162, 147, 163, 178, 179, 68, 69, 84, 100, 85, 70, 71, 86, 101, 116, 117, 102, 87, 103, 118, 119, 8, 9, 24, 40, 25, 10, 11, 26, 41, 56, 57, 42, 27, 43, 58, 59, 12, 13, 28, 44, 29, 14, 15, 30, 45, 60, 61, 46, 31, 47, 62, 63, 72, 73, 88, 104, 89, 74, 75, 90, 105, 120, 121, 106, 91, 107, 122, 123, 132, 133, 148, 164, 149, 134, 135, 150, 165, 180, 181, 166, 151, 167, 182, 183, 192, 193, 208, 224, 209, 194, 195, 210, 225, 240, 241, 226, 211, 227, 242, 243, 196, 197, 212, 228, 213, 198, 199, 214, 229, 244, 245, 230, 215, 231, 246, 247, 136, 137, 152, 168, 153, 138, 139, 154, 169, 184, 185, 170, 155, 171, 186, 187, 76, 77, 92, 108, 93, 78, 79, 94, 109, 124, 125, 110, 95, 111, 126, 127, 140, 141, 156, 172, 157, 142, 143, 158, 173, 188, 189, 174, 159, 175, 190, 191, 200, 201, 216, 232, 217, 202, 203, 218, 233, 248, 249, 234, 219, 235, 250, 251, 204, 205, 220, 236, 221, 206, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255 }; ALIGN32(const int16_t tab_1d_scan_32x32[1024]) = { 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 97, 66, 35, 67, 98, 99, 4, 5, 36, 68, 37, 6, 7, 38, 69, 100, 101, 70, 39, 71, 102, 103, 128, 129, 160, 192, 161, 130, 131, 162, 193, 224, 225, 194, 163, 195, 226, 227, 256, 257, 288, 320, 289, 258, 259, 290, 321, 352, 353, 322, 291, 323, 354, 355, 132, 133, 164, 196, 165, 134, 135, 166, 197, 228, 229, 198, 167, 199, 230, 231, 8, 9, 40, 72, 41, 10, 11, 42, 73, 104, 105, 74, 43, 75, 106, 107, 12, 13, 44, 76, 45, 14, 15, 46, 77, 108, 109, 78, 47, 79, 110, 111, 136, 137, 168, 200, 169, 138, 139, 170, 201, 232, 233, 202, 171, 203, 234, 235, 260, 261, 292, 324, 293, 262, 263, 294, 325, 356, 357, 326, 295, 327, 358, 359, 384, 385, 416, 448, 417, 386, 387, 418, 449, 480, 481, 450, 419, 451, 482, 483, 512, 513, 544, 576, 545, 514, 515, 546, 577, 608, 609, 578, 547, 579, 610, 611, 388, 389, 420, 452, 421, 390, 391, 422, 453, 484, 485, 454, 423, 455, 486, 487, 264, 265, 296, 328, 297, 266, 267, 298, 329, 360, 361, 330, 299, 331, 362, 363, 140, 141, 172, 204, 173, 142, 143, 174, 205, 236, 237, 206, 175, 207, 238, 239, 16, 17, 48, 80, 49, 18, 19, 50, 81, 112, 113, 82, 51, 83, 114, 115, 20, 21, 52, 84, 53, 22, 23, 54, 85, 116, 117, 86, 55, 87, 118, 119, 144, 145, 176, 208, 177, 146, 147, 178, 209, 240, 241, 210, 179, 211, 242, 243, 268, 269, 300, 332, 301, 270, 271, 302, 333, 364, 365, 334, 303, 335, 366, 367, 392, 393, 424, 456, 425, 394, 395, 426, 457, 488, 489, 458, 427, 459, 490, 491, 516, 517, 548, 580, 549, 518, 519, 550, 581, 612, 613, 582, 551, 583, 614, 615, 640, 641, 672, 704, 673, 642, 643, 674, 705, 736, 737, 706, 675, 707, 738, 739, 768, 769, 800, 832, 801, 770, 771, 802, 833, 864, 865, 834, 803, 835, 866, 867, 644, 645, 676, 708, 677, 646, 647, 678, 709, 740, 741, 710, 679, 711, 742, 743, 520, 521, 552, 584, 553, 522, 523, 554, 585, 616, 617, 586, 555, 587, 618, 619, 396, 397, 428, 460, 429, 398, 399, 430, 461, 492, 493, 462, 431, 463, 494, 495, 272, 273, 304, 336, 305, 274, 275, 306, 337, 368, 369, 338, 307, 339, 370, 371, 148, 149, 180, 212, 181, 150, 151, 182, 213, 244, 245, 214, 183, 215, 246, 247, 24, 25, 56, 88, 57, 26, 27, 58, 89, 120, 121, 90, 59, 91, 122, 123, 28, 29, 60, 92, 61, 30, 31, 62, 93, 124, 125, 94, 63, 95, 
126, 127, 152, 153, 184, 216, 185, 154, 155, 186, 217, 248, 249, 218, 187, 219, 250, 251, 276, 277, 308, 340, 309, 278, 279, 310, 341, 372, 373, 342, 311, 343, 374, 375, 400, 401, 432, 464, 433, 402, 403, 434, 465, 496, 497, 466, 435, 467, 498, 499, 524, 525, 556, 588, 557, 526, 527, 558, 589, 620, 621, 590, 559, 591, 622, 623, 648, 649, 680, 712, 681, 650, 651, 682, 713, 744, 745, 714, 683, 715, 746, 747, 772, 773, 804, 836, 805, 774, 775, 806, 837, 868, 869, 838, 807, 839, 870, 871, 896, 897, 928, 960, 929, 898, 899, 930, 961, 992, 993, 962, 931, 963, 994, 995, 900, 901, 932, 964, 933, 902, 903, 934, 965, 996, 997, 966, 935, 967, 998, 999, 776, 777, 808, 840, 809, 778, 779, 810, 841, 872, 873, 842, 811, 843, 874, 875, 652, 653, 684, 716, 685, 654, 655, 686, 717, 748, 749, 718, 687, 719, 750, 751, 528, 529, 560, 592, 561, 530, 531, 562, 593, 624, 625, 594, 563, 595, 626, 627, 404, 405, 436, 468, 437, 406, 407, 438, 469, 500, 501, 470, 439, 471, 502, 503, 280, 281, 312, 344, 313, 282, 283, 314, 345, 376, 377, 346, 315, 347, 378, 379, 156, 157, 188, 220, 189, 158, 159, 190, 221, 252, 253, 222, 191, 223, 254, 255, 284, 285, 316, 348, 317, 286, 287, 318, 349, 380, 381, 350, 319, 351, 382, 383, 408, 409, 440, 472, 441, 410, 411, 442, 473, 504, 505, 474, 443, 475, 506, 507, 532, 533, 564, 596, 565, 534, 535, 566, 597, 628, 629, 598, 567, 599, 630, 631, 656, 657, 688, 720, 689, 658, 659, 690, 721, 752, 753, 722, 691, 723, 754, 755, 780, 781, 812, 844, 813, 782, 783, 814, 845, 876, 877, 846, 815, 847, 878, 879, 904, 905, 936, 968, 937, 906, 907, 938, 969, 1000, 1001, 970, 939, 971, 1002, 1003, 908, 909, 940, 972, 941, 910, 911, 942, 973, 1004, 1005, 974, 943, 975, 1006, 1007, 784, 785, 816, 848, 817, 786, 787, 818, 849, 880, 881, 850, 819, 851, 882, 883, 660, 661, 692, 724, 693, 662, 663, 694, 725, 756, 757, 726, 695, 727, 758, 759, 536, 537, 568, 600, 569, 538, 539, 570, 601, 632, 633, 602, 571, 603, 634, 635, 412, 413, 444, 476, 445, 414, 415, 446, 477, 508, 509, 478, 447, 479, 510, 511, 540, 541, 572, 604, 573, 542, 543, 574, 605, 636, 637, 606, 575, 607, 638, 639, 664, 665, 696, 728, 697, 666, 667, 698, 729, 760, 761, 730, 699, 731, 762, 763, 788, 789, 820, 852, 821, 790, 791, 822, 853, 884, 885, 854, 823, 855, 886, 887, 912, 913, 944, 976, 945, 914, 915, 946, 977, 1008, 1009, 978, 947, 979, 1010, 1011, 916, 917, 948, 980, 949, 918, 919, 950, 981, 1012, 1013, 982, 951, 983, 1014, 1015, 792, 793, 824, 856, 825, 794, 795, 826, 857, 888, 889, 858, 827, 859, 890, 891, 668, 669, 700, 732, 701, 670, 671, 702, 733, 764, 765, 734, 703, 735, 766, 767, 796, 797, 828, 860, 829, 798, 799, 830, 861, 892, 893, 862, 831, 863, 894, 895, 920, 921, 952, 984, 953, 922, 923, 954, 985, 1016, 1017, 986, 955, 987, 1018, 1019, 924, 925, 956, 988, 957, 926, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 }; ALIGN32(const int16_t tab_1d_scan_2x8[16]) = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }; ALIGN32(const int16_t tab_1d_scan_4x16[64]) = { 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 49, 34, 19, 35, 50, 51, 4, 5, 20, 36, 21, 6, 7, 22, 37, 52, 53, 38, 23, 39, 54, 55, 8, 9, 24, 40, 25, 10, 11, 26, 41, 56, 57, 42, 27, 43, 58, 59, 12, 13, 28, 44, 29, 14, 15, 30, 45, 60, 61, 46, 31, 47, 62, 63 }; ALIGN32(const int16_t tab_1d_scan_8x32[256]) = { 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 97, 66, 35, 67, 98, 99, 4, 5, 36, 68, 37, 6, 7, 38, 69, 100, 101, 70, 39, 71, 102, 103, 128, 129, 160, 192, 161, 130, 131, 162, 193, 224, 225, 194, 163, 195, 226, 227, 132, 133, 164, 196, 165, 134, 135, 166, 197, 228, 229, 198, 167, 
199, 230, 231, 8, 9, 40, 72, 41, 10, 11, 42, 73, 104, 105, 74, 43, 75, 106, 107, 12, 13, 44, 76, 45, 14, 15, 46, 77, 108, 109, 78, 47, 79, 110, 111, 136, 137, 168, 200, 169, 138, 139, 170, 201, 232, 233, 202, 171, 203, 234, 235, 140, 141, 172, 204, 173, 142, 143, 174, 205, 236, 237, 206, 175, 207, 238, 239, 16, 17, 48, 80, 49, 18, 19, 50, 81, 112, 113, 82, 51, 83, 114, 115, 20, 21, 52, 84, 53, 22, 23, 54, 85, 116, 117, 86, 55, 87, 118, 119, 144, 145, 176, 208, 177, 146, 147, 178, 209, 240, 241, 210, 179, 211, 242, 243, 148, 149, 180, 212, 181, 150, 151, 182, 213, 244, 245, 214, 183, 215, 246, 247, 24, 25, 56, 88, 57, 26, 27, 58, 89, 120, 121, 90, 59, 91, 122, 123, 28, 29, 60, 92, 61, 30, 31, 62, 93, 124, 125, 94, 63, 95, 126, 127, 152, 153, 184, 216, 185, 154, 155, 186, 217, 248, 249, 218, 187, 219, 250, 251, 156, 157, 188, 220, 189, 158, 159, 190, 221, 252, 253, 222, 191, 223, 254, 255 }; ALIGN32(const int16_t tab_1d_scan_8x2[16]) = { 0, 1, 2, 4, 3, 5, 6, 8, 7, 9, 10, 12, 11, 13, 14, 15 }; ALIGN32(const int16_t tab_1d_scan_16x4[64]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, 16, 17, 20, 24, 21, 18, 19, 22, 25, 28, 29, 26, 23, 27, 30, 31, 32, 33, 36, 40, 37, 34, 35, 38, 41, 44, 45, 42, 39, 43, 46, 47, 48, 49, 52, 56, 53, 50, 51, 54, 57, 60, 61, 58, 55, 59, 62, 63 }; ALIGN32(const int16_t tab_1d_scan_32x8[256]) = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 25, 18, 11, 19, 26, 27, 4, 5, 12, 20, 13, 6, 7, 14, 21, 28, 29, 22, 15, 23, 30, 31, 32, 33, 40, 48, 41, 34, 35, 42, 49, 56, 57, 50, 43, 51, 58, 59, 64, 65, 72, 80, 73, 66, 67, 74, 81, 88, 89, 82, 75, 83, 90, 91, 36, 37, 44, 52, 45, 38, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, 68, 69, 76, 84, 77, 70, 71, 78, 85, 92, 93, 86, 79, 87, 94, 95, 96, 97, 104, 112, 105, 98, 99, 106, 113, 120, 121, 114, 107, 115, 122, 123, 128, 129, 136, 144, 137, 130, 131, 138, 145, 152, 153, 146, 139, 147, 154, 155, 100, 101, 108, 116, 109, 102, 103, 110, 117, 124, 125, 118, 111, 119, 126, 127, 132, 133, 140, 148, 141, 134, 135, 142, 149, 156, 157, 150, 143, 151, 158, 159, 160, 161, 168, 176, 169, 162, 163, 170, 177, 184, 185, 178, 171, 179, 186, 187, 192, 193, 200, 208, 201, 194, 195, 202, 209, 216, 217, 210, 203, 211, 218, 219, 164, 165, 172, 180, 173, 166, 167, 174, 181, 188, 189, 182, 175, 183, 190, 191, 196, 197, 204, 212, 205, 198, 199, 206, 213, 220, 221, 214, 207, 215, 222, 223, 224, 225, 232, 240, 233, 226, 227, 234, 241, 248, 249, 242, 235, 243, 250, 251, 228, 229, 236, 244, 237, 230, 231, 238, 245, 252, 253, 246, 239, 247, 254, 255, }; ALIGN32(const int16_t tab_1d_scan_swap_4x4[16]) = { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 }; ALIGN32(const int16_t tab_1d_scan_swap_8x8[64]) = { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 11, 18, 25, 26, 19, 27, 32, 40, 33, 34, 41, 48, 56, 49, 42, 35, 43, 50, 57, 58, 51, 59, 4, 12, 5, 6, 13, 20, 28, 21, 14, 7, 15, 22, 29, 30, 23, 31, 36, 44, 37, 38, 45, 52, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 }; ALIGN32(const int16_t tab_1d_scan_swap_16x16[256]) = { 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 19, 34, 49, 50, 35, 51, 64, 80, 65, 66, 81, 96, 112, 97, 82, 67, 83, 98, 113, 114, 99, 115, 4, 20, 5, 6, 21, 36, 52, 37, 22, 7, 23, 38, 53, 54, 39, 55, 8, 24, 9, 10, 25, 40, 56, 41, 26, 11, 27, 42, 57, 58, 43, 59, 68, 84, 69, 70, 85, 100, 116, 101, 86, 71, 87, 102, 117, 118, 103, 119, 128, 144, 129, 130, 145, 160, 176, 161, 146, 131, 147, 162, 177, 178, 163, 179, 192, 208, 193, 194, 209, 224, 240, 225, 210, 195, 211, 226, 241, 242, 227, 243, 132, 148, 133, 134, 149, 164, 180, 165, 150, 135, 151, 166, 181, 182, 167, 183, 72, 
88, 73, 74, 89, 104, 120, 105, 90, 75, 91, 106, 121, 122, 107, 123, 12, 28, 13, 14, 29, 44, 60, 45, 30, 15, 31, 46, 61, 62, 47, 63, 76, 92, 77, 78, 93, 108, 124, 109, 94, 79, 95, 110, 125, 126, 111, 127, 136, 152, 137, 138, 153, 168, 184, 169, 154, 139, 155, 170, 185, 186, 171, 187, 196, 212, 197, 198, 213, 228, 244, 229, 214, 199, 215, 230, 245, 246, 231, 247, 200, 216, 201, 202, 217, 232, 248, 233, 218, 203, 219, 234, 249, 250, 235, 251, 140, 156, 141, 142, 157, 172, 188, 173, 158, 143, 159, 174, 189, 190, 175, 191, 204, 220, 205, 206, 221, 236, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255 }; ALIGN32(const int16_t tab_1d_scan_swap_32x32[1024]) = { 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 35, 66, 97, 98, 67, 99, 128, 160, 129, 130, 161, 192, 224, 193, 162, 131, 163, 194, 225, 226, 195, 227, 4, 36, 5, 6, 37, 68, 100, 69, 38, 7, 39, 70, 101, 102, 71, 103, 8, 40, 9, 10, 41, 72, 104, 73, 42, 11, 43, 74, 105, 106, 75, 107, 132, 164, 133, 134, 165, 196, 228, 197, 166, 135, 167, 198, 229, 230, 199, 231, 256, 288, 257, 258, 289, 320, 352, 321, 290, 259, 291, 322, 353, 354, 323, 355, 384, 416, 385, 386, 417, 448, 480, 449, 418, 387, 419, 450, 481, 482, 451, 483, 260, 292, 261, 262, 293, 324, 356, 325, 294, 263, 295, 326, 357, 358, 327, 359, 136, 168, 137, 138, 169, 200, 232, 201, 170, 139, 171, 202, 233, 234, 203, 235, 12, 44, 13, 14, 45, 76, 108, 77, 46, 15, 47, 78, 109, 110, 79, 111, 16, 48, 17, 18, 49, 80, 112, 81, 50, 19, 51, 82, 113, 114, 83, 115, 140, 172, 141, 142, 173, 204, 236, 205, 174, 143, 175, 206, 237, 238, 207, 239, 264, 296, 265, 266, 297, 328, 360, 329, 298, 267, 299, 330, 361, 362, 331, 363, 388, 420, 389, 390, 421, 452, 484, 453, 422, 391, 423, 454, 485, 486, 455, 487, 512, 544, 513, 514, 545, 576, 608, 577, 546, 515, 547, 578, 609, 610, 579, 611, 640, 672, 641, 642, 673, 704, 736, 705, 674, 643, 675, 706, 737, 738, 707, 739, 516, 548, 517, 518, 549, 580, 612, 581, 550, 519, 551, 582, 613, 614, 583, 615, 392, 424, 393, 394, 425, 456, 488, 457, 426, 395, 427, 458, 489, 490, 459, 491, 268, 300, 269, 270, 301, 332, 364, 333, 302, 271, 303, 334, 365, 366, 335, 367, 144, 176, 145, 146, 177, 208, 240, 209, 178, 147, 179, 210, 241, 242, 211, 243, 20, 52, 21, 22, 53, 84, 116, 85, 54, 23, 55, 86, 117, 118, 87, 119, 24, 56, 25, 26, 57, 88, 120, 89, 58, 27, 59, 90, 121, 122, 91, 123, 148, 180, 149, 150, 181, 212, 244, 213, 182, 151, 183, 214, 245, 246, 215, 247, 272, 304, 273, 274, 305, 336, 368, 337, 306, 275, 307, 338, 369, 370, 339, 371, 396, 428, 397, 398, 429, 460, 492, 461, 430, 399, 431, 462, 493, 494, 463, 495, 520, 552, 521, 522, 553, 584, 616, 585, 554, 523, 555, 586, 617, 618, 587, 619, 644, 676, 645, 646, 677, 708, 740, 709, 678, 647, 679, 710, 741, 742, 711, 743, 768, 800, 769, 770, 801, 832, 864, 833, 802, 771, 803, 834, 865, 866, 835, 867, 896, 928, 897, 898, 929, 960, 992, 961, 930, 899, 931, 962, 993, 994, 963, 995, 772, 804, 773, 774, 805, 836, 868, 837, 806, 775, 807, 838, 869, 870, 839, 871, 648, 680, 649, 650, 681, 712, 744, 713, 682, 651, 683, 714, 745, 746, 715, 747, 524, 556, 525, 526, 557, 588, 620, 589, 558, 527, 559, 590, 621, 622, 591, 623, 400, 432, 401, 402, 433, 464, 496, 465, 434, 403, 435, 466, 497, 498, 467, 499, 276, 308, 277, 278, 309, 340, 372, 341, 310, 279, 311, 342, 373, 374, 343, 375, 152, 184, 153, 154, 185, 216, 248, 217, 186, 155, 187, 218, 249, 250, 219, 251, 28, 60, 29, 30, 61, 92, 124, 93, 62, 31, 63, 94, 125, 126, 95, 127, 156, 188, 157, 158, 189, 220, 252, 221, 190, 159, 191, 222, 253, 254, 223, 255, 280, 312, 281, 282, 313, 344, 376, 345, 
314, 283, 315, 346, 377, 378, 347, 379, 404, 436, 405, 406, 437, 468, 500, 469, 438, 407, 439, 470, 501, 502, 471, 503, 528, 560, 529, 530, 561, 592, 624, 593, 562, 531, 563, 594, 625, 626, 595, 627, 652, 684, 653, 654, 685, 716, 748, 717, 686, 655, 687, 718, 749, 750, 719, 751, 776, 808, 777, 778, 809, 840, 872, 841, 810, 779, 811, 842, 873, 874, 843, 875, 900, 932, 901, 902, 933, 964, 996, 965, 934, 903, 935, 966, 997, 998, 967, 999, 904, 936, 905, 906, 937, 968, 1000, 969, 938, 907, 939, 970, 1001, 1002, 971, 1003, 780, 812, 781, 782, 813, 844, 876, 845, 814, 783, 815, 846, 877, 878, 847, 879, 656, 688, 657, 658, 689, 720, 752, 721, 690, 659, 691, 722, 753, 754, 723, 755, 532, 564, 533, 534, 565, 596, 628, 597, 566, 535, 567, 598, 629, 630, 599, 631, 408, 440, 409, 410, 441, 472, 504, 473, 442, 411, 443, 474, 505, 506, 475, 507, 284, 316, 285, 286, 317, 348, 380, 349, 318, 287, 319, 350, 381, 382, 351, 383, 412, 444, 413, 414, 445, 476, 508, 477, 446, 415, 447, 478, 509, 510, 479, 511, 536, 568, 537, 538, 569, 600, 632, 601, 570, 539, 571, 602, 633, 634, 603, 635, 660, 692, 661, 662, 693, 724, 756, 725, 694, 663, 695, 726, 757, 758, 727, 759, 784, 816, 785, 786, 817, 848, 880, 849, 818, 787, 819, 850, 881, 882, 851, 883, 908, 940, 909, 910, 941, 972, 1004, 973, 942, 911, 943, 974, 1005, 1006, 975, 1007, 912, 944, 913, 914, 945, 976, 1008, 977, 946, 915, 947, 978, 1009, 1010, 979, 1011, 788, 820, 789, 790, 821, 852, 884, 853, 822, 791, 823, 854, 885, 886, 855, 887, 664, 696, 665, 666, 697, 728, 760, 729, 698, 667, 699, 730, 761, 762, 731, 763, 540, 572, 541, 542, 573, 604, 636, 605, 574, 543, 575, 606, 637, 638, 607, 639, 668, 700, 669, 670, 701, 732, 764, 733, 702, 671, 703, 734, 765, 766, 735, 767, 792, 824, 793, 794, 825, 856, 888, 857, 826, 795, 827, 858, 889, 890, 859, 891, 916, 948, 917, 918, 949, 980, 1012, 981, 950, 919, 951, 982, 1013, 1014, 983, 1015, 920, 952, 921, 922, 953, 984, 1016, 985, 954, 923, 955, 986, 1017, 1018, 987, 1019, 796, 828, 797, 798, 829, 860, 892, 861, 830, 799, 831, 862, 893, 894, 863, 895, 924, 956, 925, 926, 957, 988, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023 }; const int16_t *tab_coef_scan1_list_nxn[2][4] = { { tab_1d_scan_4x4, tab_1d_scan_8x8, tab_1d_scan_16x16, tab_1d_scan_32x32 }, { tab_1d_scan_swap_4x4, tab_1d_scan_swap_8x8, tab_1d_scan_swap_16x16, tab_1d_scan_swap_32x32 } }; const int16_t *tab_coef_scan1_list_hor[3] = { tab_1d_scan_2x8, tab_1d_scan_4x16, tab_1d_scan_8x32 }; const int16_t *tab_coef_scan1_list_ver[3] = { tab_1d_scan_8x2, tab_1d_scan_16x4, tab_1d_scan_32x8 }; /** * =========================================================================== * function definition * =========================================================================== */ /* --------------------------------------------------------------------------- * CGϵɨ˳ϵתã */ static void coeff_scan_4x4_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { int i; int j = 0; for (i = 0; i < 16; i++) { int xx = tab_scan_4x4[i][0]; int yy = tab_scan_4x4[i][1]; coeff_t level = src[(yy << i_src_shift) + xx]; dst[j] = level; j++; } } /* --------------------------------------------------------------------------- * CGϵɨ˳ϵתã */ static void coeff_scan_4x4_yx_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { int i; int j = 0; for (i = 0; i < 16; i++) { int xx = tab_scan_4x4[i][1]; int yy = tab_scan_4x4[i][0]; coeff_t level = src[(yy << i_src_shift) + xx]; dst[j] = level; j++; } } /* --------------------------------------------------------------------------- */ void 
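/* descriptive note: coeff_scan4_xy_c / coeff_scan4_yx_c below pack r1..r4 (one row of four coefficients per 64-bit word, assuming coeff_t is 16-bit) into a contiguous 4x4 block and hand it to the LUMA_4x4 scan kernel with stride shift 2, which emits the coefficients in zig-zag (xy) or transposed zig-zag (yx) order */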
coeff_scan4_xy_c(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { uint64_t src[16]; src[0] = r1; src[1] = r2; src[2] = r3; src[3] = r4; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, (coeff_t *)src, 2); } /* --------------------------------------------------------------------------- */ void coeff_scan4_yx_c(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { uint64_t src[16]; src[0] = r1; src[1] = r2; src[2] = r3; src[3] = r4; g_funcs.transpose_coeff_scan[LUMA_4x4][1](dst, (coeff_t *)src, 2); } /* --------------------------------------------------------------------------- */ static void coeff_scan_8x8_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 4; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_2x2[i_cg][0] << 2; int cg_y = tab_scan_2x2[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_8x8_yx_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 4; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_2x2[i_cg][1] << 2; int cg_y = tab_scan_2x2[i_cg][0] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][1](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_16x16_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 16; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_4x4[i_cg][0] << 2; int cg_y = tab_scan_4x4[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_16x16_yx_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 16; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_4x4[i_cg][1] << 2; int cg_y = tab_scan_4x4[i_cg][0] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][1](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_32x32_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 64; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_8x8[i_cg][0] << 2; int cg_y = tab_scan_8x8[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_32x32_yx_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 64; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_8x8[i_cg][1] << 2; int cg_y = tab_scan_8x8[i_cg][0] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][1](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_4x16_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 4; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_1x4[i_cg][0] << 2; int cg_y = tab_scan_1x4[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + 
cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_16x4_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 4; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_4x1[i_cg][0] << 2; int cg_y = tab_scan_4x1[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_8x32_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 16; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_2x8[i_cg][0] << 2; int cg_y = tab_scan_2x8[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ static void coeff_scan_32x8_xy_c(coeff_t *dst, const coeff_t *src, int i_src_shift) { const int num_cg = 16; int i_cg; for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { int cg_x = tab_scan_8x2[i_cg][0] << 2; int cg_y = tab_scan_8x2[i_cg][1] << 2; g_funcs.transpose_coeff_scan[LUMA_4x4][0](dst, src + (cg_y << i_src_shift) + cg_x, i_src_shift); dst += 16; } } /* --------------------------------------------------------------------------- */ void xavs2_cg_scan_init(uint32_t cpuid, intrinsic_func_t *pf) { pf->transpose_coeff_scan[LUMA_4x4 ][0] = coeff_scan_4x4_xy_c; pf->transpose_coeff_scan[LUMA_4x4 ][1] = coeff_scan_4x4_yx_c; pf->transpose_coeff_scan[LUMA_8x8 ][0] = coeff_scan_8x8_xy_c; pf->transpose_coeff_scan[LUMA_8x8 ][1] = coeff_scan_8x8_yx_c; pf->transpose_coeff_scan[LUMA_16x16][0] = coeff_scan_16x16_xy_c; pf->transpose_coeff_scan[LUMA_16x16][1] = coeff_scan_16x16_yx_c; pf->transpose_coeff_scan[LUMA_32x32][0] = coeff_scan_32x32_xy_c; pf->transpose_coeff_scan[LUMA_32x32][1] = coeff_scan_32x32_yx_c; pf->transpose_coeff_scan[LUMA_4x16 ][0] = coeff_scan_4x16_xy_c; pf->transpose_coeff_scan[LUMA_16x4 ][0] = coeff_scan_16x4_xy_c; pf->transpose_coeff_scan[LUMA_8x32 ][0] = coeff_scan_8x32_xy_c; pf->transpose_coeff_scan[LUMA_32x8 ][0] = coeff_scan_32x8_xy_c; pf->transpose_coeff_4x4[0] = coeff_scan4_xy_c; pf->transpose_coeff_4x4[1] = coeff_scan4_yx_c; #if HAVE_MMX /* SSE 128bit */ if (cpuid & XAVS2_CPU_SSE42) { pf->transpose_coeff_scan[LUMA_4x4][0] = coeff_scan_4x4_xy_sse128; pf->transpose_coeff_scan[LUMA_4x4][1] = coeff_scan_4x4_yx_sse128; #if ARCH_X86_64 pf->transpose_coeff_4x4[0] = coeff_scan4_xy_sse128; pf->transpose_coeff_4x4[1] = coeff_scan4_yx_sse128; #endif } /* AVX 256bit */ if (cpuid & XAVS2_CPU_AVX2) { #if ARCH_X86_64 /* avxsse45%ңĬϹر */ // pf->transpose_coeff_4x4[0] = coeff_scan4_xy_avx; // pf->transpose_coeff_4x4[1] = coeff_scan4_yx_avx; #endif } #else UNUSED_PARAMETER(cpuid); #endif } xavs2-1.3/source/common/common.c000066400000000000000000000223461340660520300166520ustar00rootroot00000000000000/* * common.c * * Description of this file: * misc common functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include #include #if SYS_WINDOWS #include #include #else #include #endif #include #if HAVE_MALLOC_H #include #endif /** * =========================================================================== * global variables * =========================================================================== */ static size_t g_xavs2_size_mem_alloc = 0; const float FRAME_RATE[8] = { 24000.0f / 1001.0f, 24.0f, 25.0f, 30000.0f / 1001.0f, 30.0f, 50.0f, 60000.0f / 1001.0f, 60.0f }; const char *xavs2_preset_names[] = { "ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo", NULL }; xavs2_log_t g_xavs2_default_log = { XAVS2_LOG_DEBUG, "default" }; #if XAVS2_TRACE FILE *h_trace = NULL; /* global file handle for trace file */ int g_sym_count = 0; /* global symbol count for trace */ int g_bit_count = 0; /* global bit count for trace */ #endif #if PTW32_STATIC_LIB #define WIN32_LEAN_AND_MEAN #include /* this is a global in pthread-win32 to indicate if it has been * initialized or not */ extern int ptw32_processInitialized; #endif #if XAVS2_TRACE /** * =========================================================================== * trace file * =========================================================================== */ /* --------------------------------------------------------------------------- */ int xavs2_trace_init(xavs2_param_t *param) { if (strlen(param->psz_trace_file) > 0) { /* create or truncate the trace file */ h_trace = fopen(param->psz_trace_file, "wt"); if (h_trace == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "trace: can't write to %s\n", param->psz_trace_file); return -1; } } return 0; } /* --------------------------------------------------------------------------- */ void xavs2_trace_destroy(void) { if (h_trace) { fclose(h_trace); } } /* --------------------------------------------------------------------------- */ int xavs2_trace(const char *psz_fmt, ...) 
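/* appends the formatted message to the global trace file h_trace (when it was opened by xavs2_trace_init), flushes it, and returns the number of characters written; returns 0 when tracing is disabled */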
{ int len = 0; /* append to the trace file */ if (h_trace) { va_list arg; va_start(arg, psz_fmt); len = vfprintf(h_trace, psz_fmt, arg); fflush(h_trace); va_end(arg); } return len; } #endif /** * =========================================================================== * xavs2_log * =========================================================================== */ #ifdef _MSC_VER /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void xavs2_set_font_color(int color) { static const WORD colors[] = { FOREGROUND_INTENSITY | FOREGROUND_RED, // red FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN, // yellow FOREGROUND_INTENSITY | FOREGROUND_GREEN, // green FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_BLUE, // cyan FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE, // white }; color = XAVS2_MIN(4, color); SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors[color]); } #endif /* --------------------------------------------------------------------------- */ static void xavs2_log_default(int i_log_level, const char *psz_fmt) { #if !defined(_MSC_VER) static const char str_color_clear[] = "\033[0m"; // "\033[0m" static const char str_color[][16] = { /* red yellow green cyan (default) */ "\033[1;31m", "\033[1;33m", "\033[1;32m", "\033[1;36m", "\033[0m" }; const char *cur_color = str_color[i_log_level]; #endif static const char *null_prefix = ""; const char *psz_prefix = null_prefix; switch (i_log_level) { case XAVS2_LOG_ERROR: psz_prefix = "xavs2[e]: "; break; case XAVS2_LOG_WARNING: psz_prefix = "xavs2[w]: "; break; case XAVS2_LOG_INFO: psz_prefix = "xavs2[i]: "; break; case XAVS2_LOG_DEBUG: psz_prefix = "xavs2[d]: "; break; default: psz_prefix = "xavs2[u]: "; #if !defined(_MSC_VER) cur_color = str_color[0]; #endif break; } #if defined(_MSC_VER) xavs2_set_font_color(i_log_level); /* set color */ fprintf(stdout, "%s%s", psz_prefix, psz_fmt); xavs2_set_font_color(4); /* restore to white color */ #else if (i_log_level != XAVS2_LOG_INFO) { fprintf(stdout, "%s%s%s%s", cur_color, psz_prefix, psz_fmt, str_color_clear); } else { fprintf(stdout, "%s%s", psz_prefix, psz_fmt); } #endif } /* --------------------------------------------------------------------------- */ void xavs2_log(void *p, int i_log_level, const char *psz_fmt, ...)
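/* p is an optional xavs2_log_t handle: messages whose level exceeds its configured level (or the default logger's level when p is NULL) are dropped; the rest are formatted into a fixed 2048-byte stack buffer and passed to xavs2_log_default, which adds the level-dependent "xavs2[?]: " prefix and console color */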
{ xavs2_log_t *h = (xavs2_log_t *)p; int i_output_log_level = g_xavs2_default_log.i_log_level; if (h != NULL) { i_output_log_level = h->i_log_level; } if ((i_log_level & 0x0F) <= i_output_log_level) { va_list arg; char str_in[2048]; va_start(arg, psz_fmt); vsprintf(str_in, psz_fmt, arg); xavs2_log_default(i_log_level, str_in); va_end(arg); } } /* xavs2_malloc : will do or emulate a memalign * you have to use xavs2_free for buffers allocated with xavs2_malloc */ void *xavs2_malloc(size_t i_size) { intptr_t mask = CACHE_LINE_SIZE - 1; uint8_t *align_buf = NULL; size_t size_malloc = i_size + mask + sizeof(void **); uint8_t *buf = (uint8_t *)malloc(size_malloc); if (buf != NULL) { g_xavs2_size_mem_alloc += size_malloc; align_buf = buf + mask + sizeof(void **); align_buf -= (intptr_t)align_buf & mask; *(((void **)align_buf) - 1) = buf; } else { fprintf(stderr, "malloc of size %zu failed\n", i_size); } return align_buf; } void *xavs2_calloc(size_t count, size_t size) { void *p = xavs2_malloc(count * size); if (p != NULL) { memset(p, 0, size * sizeof(uint8_t)); } return p; } void xavs2_free(void *ptr) { if (ptr != NULL) { free(*(((void **)ptr) - 1)); } } size_t xavs2_get_total_malloc_space(void) { return g_xavs2_size_mem_alloc; } /** * =========================================================================== * utilities * =========================================================================== */ /* --------------------------------------------------------------------------- * get time */ int64_t xavs2_mdate(void) { #if SYS_WINDOWS LARGE_INTEGER nFreq; if (QueryPerformanceFrequency(&nFreq)) { // طʾӲָ֧߾ȼ LARGE_INTEGER t1; QueryPerformanceCounter(&t1); return (int64_t)(1000000 * t1.QuadPart / (double)nFreq.QuadPart); } else { // Ӳ֧£ʹú뼶ϵͳʱ struct timeb tb; ftime(&tb); return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000; } #else struct timeval tv_date; gettimeofday(&tv_date, NULL); return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec; #endif } /** * =========================================================================== * thread * =========================================================================== */ /* --------------------------------------------------------------------------- */ int xavs2_create_thread(xavs2_thread_t *tid, xavs2_tfunc_t tfunc, void *targ) { return xavs2_thread_create(tid, NULL, tfunc, targ); } xavs2-1.3/source/common/common.h000066400000000000000000002274241340660520300166630ustar00rootroot00000000000000/* * common.h * * Description of this file: * misc common functionsdefinition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_COMMON_H #define XAVS2_COMMON_H /** * =========================================================================== * common include files * =========================================================================== */ #include "defines.h" #include "osdep.h" #include "avs2_defs.h" #include #include #include #include #include #include /** * =========================================================================== * macros * =========================================================================== */ /* --------------------------------------------------------------------------- * predicate mode & cu type */ #define ALLOW_HOR_TU_PART(mode) (((1 << (mode)) & MASK_HOR_TU_MODES) != 0) #define ALLOW_VER_TU_PART(mode) (((1 << (mode)) & MASK_VER_TU_MODES) != 0) #define IS_HOR_PU_PART(mode) (((1 << (mode)) & MASK_HOR_PU_MODES) != 0) #define IS_VER_PU_PART(mode) (((1 << (mode)) & MASK_VER_PU_MODES) != 0) #define IS_INTRA_MODE(mode) (((1 << (mode)) & MASK_INTRA_MODES ) != 0) #define IS_INTER_MODE(mode) (((1 << (mode)) & MASK_INTER_MODES ) != 0) #define IS_INTER_MODE_NS(mode) (((1 << (mode)) & MASK_INTER_NOSKIP) != 0) /* is inter mode (except SKIP)? */ #define IS_SKIP_MODE(mode) ((mode) == PRED_SKIP) #define IS_INTRA(cu) IS_INTRA_MODE((cu)->i_mode) #define IS_INTER(cu) IS_INTER_MODE((cu)->i_mode) #define IS_SKIP(cu) IS_SKIP_MODE((cu)->i_mode) /* --------------------------------------------------------------------------- * weight cost of mvd/ref */ #define LAMBDA_ACCURACY_BITS 16 #define LAMBDA_FACTOR(lambda) ((int)((double)(1<>LAMBDA_ACCURACY_BITS) /* --------------------------------------------------------------------------- * multi line macros */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define MULTI_LINE_MACRO_BEGIN do { #define MULTI_LINE_MACRO_END \ __pragma(warning(push))\ __pragma(warning(disable:4127))\ } while (0)\ __pragma(warning(pop)) #else #define MULTI_LINE_MACRO_BEGIN { #define MULTI_LINE_MACRO_END } #endif /* --------------------------------------------------------------------------- * memory malloc */ #define CHECKED_MALLOC(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ (var) = (type)xavs2_malloc(size);\ if ((var) == NULL) {\ goto fail;\ }\ MULTI_LINE_MACRO_END #define CHECKED_MALLOCZERO(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ size_t new_size = ((size + 31) >> 5) << 5; /* align the size to 32 bytes */ \ CHECKED_MALLOC(var, type, new_size);\ g_funcs.memzero_aligned(var, new_size); \ MULTI_LINE_MACRO_END /** * =========================================================================== * enum defines * =========================================================================== */ /* --------------------------------------------------------------------------- * rate control methods */ enum rc_method_e { XAVS2_RC_CQP = 0, /* const QP */ XAVS2_RC_CBR_FRM = 1, /* const bit-rate, frame level */ XAVS2_RC_CBR_SCU = 2 /* const bit-rate, SCU level */ }; /* --------------------------------------------------------------------------- * ME methods */ enum me_mothod_e { XAVS2_ME_FS = 0, /* full search */ XAVS2_ME_DIA = 1, /* diamond search */ XAVS2_ME_HEX = 2, /* hexagon search */ XAVS2_ME_UMH = 3, /* UMH search */ XAVS2_ME_TZ = 4 /* TZ 
search */ }; /* --------------------------------------------------------------------------- * slice types */ enum slice_type_e { SLICE_TYPE_I = 0, /* slice type: I */ SLICE_TYPE_P = 1, /* slice type: P */ SLICE_TYPE_B = 2, /* slice type: B */ SLICE_TYPE_F = 3, /* slice type: F */ SLICE_TYPE_NUM = 4 /* slice type number */ }; /* --------------------------------------------------------------------------- * NAL unit type */ enum nal_unit_type_e { NAL_UNKNOWN = 0, NAL_SLICE = 1, NAL_SLICE_DPA = 2, NAL_SLICE_DPB = 3, NAL_SLICE_DPC = 4, NAL_SLICE_IDR = 5, /* ref_idc != 0 */ NAL_SEI = 6, /* ref_idc == 0 */ NAL_SPS = 7, NAL_PPS = 8, NAL_AUD = 9, NAL_FILLER = 12 }; /* --------------------------------------------------------------------------- * NAL priority */ enum nal_priority_e { NAL_PRIORITY_DISPOSABLE = 0, NAL_PRIORITY_LOW = 1, NAL_PRIORITY_HIGH = 2, NAL_PRIORITY_HIGHEST = 3 }; /* --------------------------------------------------------------------------- * all prediction modes (n = N/2) */ enum cu_pred_mode_e { PRED_INVLALID = -1, /* invalid mode, as initial value */ /* all inter modes: 8 */ PRED_SKIP = 0, /* skip/direct block: 1 */ PRED_2Nx2N = 1, /* 2N x 2N block: 1 */ PRED_2NxN = 2, /* 2N x N block: 2 */ PRED_Nx2N = 3, /* N x 2N block: 2 */ PRED_2NxnU = 4, /* 2N x n + 2N x 3n block: 2 */ PRED_2NxnD = 5, /* 2N x 3n + 2N x n block: 2 */ PRED_nLx2N = 6, /* n x 2N + 3n x 2N block: 2 */ PRED_nRx2N = 7, /* 3n x 2N + n x 2N block: 2 */ /* all intra modes: 4 */ PRED_I_2Nx2N = 8, /* 2N x 2N block: 1 */ PRED_I_NxN = 9, /* N x N block: 4 */ PRED_I_2Nxn = 10, /* 2N x n (32x8, 16x4) block: 4 */ PRED_I_nx2N = 11, /* n x 2N (8x32, 4x16) block: 4 */ /* mode numbers */ MAX_PRED_MODES = 12, /* total 12 pred modes, include: */ MAX_INTER_MODES = 8, /* 8 inter modes */ MAX_INTRA_MODES = 4, /* 4 intra modes */ /* masks */ MASK_HOR_TU_MODES = 0x0430, /* mask for horizontal TU partition */ MASK_VER_TU_MODES = 0x08C0, /* mask for vertical TU partition */ MASK_HOR_PU_MODES = 0x0434, /* mask for horizontal PU partition */ MASK_VER_PU_MODES = 0x08C8, /* mask for vertical PU partition */ MASK_INTER_MODES = 0x00FF, /* mask for inter modes */ MASK_INTER_NOSKIP = 0x00FE, /* mask for inter modes except skip */ MASK_INTRA_MODES = 0x0F00 /* mask for intra modes */ }; /* --------------------------------------------------------------------------- * splitting type of transform unit */ enum tu_split_type_e { TU_SPLIT_INVALID = -1, /* invalid split type */ TU_SPLIT_NON = 0, /* not split */ TU_SPLIT_HOR = 1, /* horizontally split into 4 blocks */ TU_SPLIT_VER = 2, /* vertically split into 4 blocks */ TU_SPLIT_CROSS = 3, /* cross split into 4 blocks */ TU_SPLIT_TYPE_NUM = 4 /* number of split types */ }; /* --------------------------------------------------------------------------- * image components */ enum image_component_type_e { IMG_Y = 0, /* image component: Y */ IMG_U = 1, /* image component: Cb */ IMG_V = 2, /* image component: Cr */ IMG_CMPNTS = 3 /* image component number */ }; /* --------------------------------------------------------------------------- */ enum coding_type_e { FRAME_CODING = 0, FIELD_CODING = 3 }; /* --------------------------------------------------------------------------- */ enum sequence_type_e { FIELD, FRAME }; /* --------------------------------------------------------------------------- * task type */ typedef enum task_type_e { XAVS2_TASK_FRAME = 0, /* frame task */ XAVS2_TASK_SLICE = 1, /* slice task */ XAVS2_TASK_ROW = 2 /* row task */ } task_type_e; /* 
--------------------------------------------------------------------------- * task status */ typedef enum task_status_e { XAVS2_TASK_FREE = 0, /* task is free */ XAVS2_TASK_BUSY = 1, /* task is alloted */ XAVS2_TASK_RDO_DONE = 2, /* RDO is finished */ XAVS2_TASK_AEC_DONE = 3 /* AEC is finished */ } task_status_e; /* --------------------------------------------------------------------------- * signals */ enum xavs2_signal_e { SIG_FRM_CONTEXT_ALLOCATED = 0, /* one frame context is allocated */ SIG_FRM_CONTEXT_RELEASED = 1, /* one frame context is released */ SIG_FRM_AEC_COMPLETED = 2, /* one frame finishes AEC */ SIG_FRM_AEC_DONE = 3, /* one frame finishes AEC */ SIG_FRM_DELIVERED = 4, /* one frame is outputted */ SIG_FRM_BUFFER_RELEASED = 5, /* one frame buffer is available */ SIG_ROW_CONTEXT_RELEASED = 6, /* one row context is released */ SIG_COUNT = 7 }; /** * =========================================================================== * type defines * =========================================================================== */ #include "basic_types.h" /* --------------------------------------------------------------------------- * function handle types */ /* thread function: proceeding of one thread */ typedef void *(*xavs2_tfunc_t)(void *); // typedef void *(__stdcall *xavs2_afunc_t)(void *); /** * =========================================================================== * internal include files * =========================================================================== */ #include "xavs2.h" #include "pixel.h" #include "intra.h" #include "transform.h" #include "filter.h" #if HAVE_MMX #include "vec/intrinsic.h" #endif #include "primitives.h" /** * =========================================================================== * struct type defines * =========================================================================== */ #if defined(_MSC_VER) || defined(__ICL) #pragma warning(disable: 4201) // non-standard extension used (nameless struct/union) #endif /* --------------------------------------------------------------------------- * motion vector */ union mv_t { uint32_t v; // v = ((y << 16) | (x & 0xFFFF)), 32-bit struct { int16_t x; // x, low 16-bit int16_t y; // y, high 16-bit }; }; /* --------------------------------------------------------------------------- * bitstream */ typedef struct bitstream { uint8_t *p_start; /* actual buffer for written bytes */ uint8_t *p; /* pointer to byte written currently */ uint8_t *p_end; /* end of the actual buffer */ int i_left; /* current bit counter to go */ } bs_t; /* --------------------------------------------------------------------------- * struct for context management */ typedef union context_t { struct { unsigned MPS : 1; // 1 bit unsigned LG_PMPS : 11; // 11 bits unsigned cycno : 2; // 2 bits }; uint16_t v; } context_t; typedef union runlevel_pair_t { struct { coeff_t level; int8_t run; }; uint32_t v; } runlevel_pair_t; /* --------------------------------------------------------------------------- * run-level infos (CG: Coefficient Group) * رı任Ϊ 32x32 8*8 CG */ typedef struct runlevel_t { ALIGN16(runlevel_pair_t runlevels_cg[16]); int num_cg; int last_pos_cg; /* Last Coeff Position in CG */ int b_hor; int i_stride_shift; coeff_t *quant_coeff; /* coefficients */ coeff_t *transposed_coeff; /* coefficients in CG scan order */ const int16_t(*tab_cg_scan)[2]; /* CG scan table */ cu_info_t *p_cu_info; } runlevel_t; /* --------------------------------------------------------------------------- * binary_t */ typedef struct binary_t { /* 
﷨Ԫرúָ */ int (*write_intra_pred_mode)(aec_t *p_aec, int ipmode); int (*write_ctu_split_flag)(aec_t *p_aec, int i_cu_split, int i_cu_level); int (*est_cu_header)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu); int (*est_cu_refs_mvds)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu); int (*est_luma_block_coeff)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int i_stride_shift, int is_intra, int intra_mode, int max_bits); int (*est_chroma_block_coeff)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int max_bits); #if ENABLE_RATE_CONTROL_CU int (*write_cu_cbp_dqp)(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int *last_dqp); #else int (*write_cu_cbp)(aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, xavs2_t *h); #endif int (*write_sao_mergeflag)(aec_t *p_aec, int mergeleft_avail, int mergeup_avail, SAOBlkParam *saoBlkParam); int (*write_sao_mode)(aec_t *p_aec, SAOBlkParam *saoBlkParam); int (*write_sao_offset)(aec_t *p_aec, SAOBlkParam *saoBlkParam); int (*write_sao_type)(aec_t *p_aec, SAOBlkParam *saoBlkParam); int (*write_alf_lcu_ctrl)(aec_t *p_aec, uint8_t iflag); } binary_t; /* --------------------------------------------------------------------------- * const for syntax elements */ #define NUM_BLOCK_TYPES 3 #define NUM_CU_TYPE_CTX 5 #define NUM_INTRA_PU_TYPE_CTX 1 #define NUM_INTRA_MODE_CTX 7 #define NUM_INTRA_MODE_C_CTX 3 #define NUM_SPLIT_CTX (CTU_DEPTH - 1) // CU depth #define NUM_TU_CTX 3 #define NUM_INTER_DIR_CTX 15 #define NUM_INTER_DIR_MIN_CTX 2 #define NUM_AMP_CTX 2 #define NUM_CBP_CTX 9 #define NUM_MVD_CTX 3 #define NUM_DMH_MODE_CTX 12 // (MAX_CU_SIZE_IN_BIT - MIN_CU_SIZE_IN_BIT + 1) * 4 #define NUM_REF_NO_CTX 3 #define NUM_DELTA_QP_CTX 4 #define NUM_LAST_CG_CTX_LUMA 6 #define NUM_LAST_CG_CTX_CHROMA 6 #define NUM_SIGN_CG_CTX_LUMA 2 #define NUM_SIGN_CG_CTX_CHROMA 1 #define NUM_LAST_POS_CTX_LUMA 48 /* last_coeff_pos_x last_coeff_pos_y 48ɫȷ */ #define NUM_LAST_POS_CTX_CHROMA 12 /* last_coeff_pos_x last_coeff_pos_y 12ɫȷ */ #define NUM_MAP_CTX 12 #define NUM_LAST_CG_CTX (NUM_LAST_CG_CTX_LUMA + NUM_LAST_CG_CTX_CHROMA) /* last_cg_pos:6; + last_cg0_flag:2(IsChroma); last_cg_x:2; last_cg_y:2 */ #define NUM_SIGN_CG_CTX (NUM_SIGN_CG_CTX_LUMA + NUM_SIGN_CG_CTX_CHROMA) #define NUM_LAST_POS_CTX (NUM_LAST_POS_CTX_LUMA + NUM_LAST_POS_CTX_CHROMA) /* last_coeff_pos_x: (30) + last_coeff_pos_y: (30) */ #define NUM_COEFF_LEVEL_CTX 40 /* CoeffLevelMinus1Band Ϊ 0 ʱ coeff_level_minus1_pos_in_band */ #define NUM_SAO_MERGE_FLAG_CTX 3 #define NUM_SAO_MODE_CTX 1 #define NUM_SAO_OFFSET_CTX 1 #define NUM_ALF_LCU_CTX 4 /* --------------------------------------------------------------------------- * reference parameter set */ typedef struct xavs2_rps_t { int idx_in_gop; /* index within a GOP */ int poc; /* picture order count */ int qp_offset; /* QP offset based on key frame */ int referd_by_others; /* referenced by other pictures? 
*/ int temporal_id; /* temporal id */ int reserved; /* reserved (not used) */ int num_of_ref; /* number of reference pictures */ int num_to_rm; /* number of picture to be removed */ int ref_pic[XAVS2_MAX_REFS];/* delta COI of reference pictures */ int rm_pic [8]; /* delta COI of removed pictures */ } xavs2_rps_t; /* --------------------------------------------------------------------------- * xavs2 encoder input parameters */ typedef struct xavs2_param_t { /* --- sequence --------------------------------------------- */ int profile_id; /* profile id */ int level_id; /* level id */ int progressive_sequence; /* progressive sequence? */ int chroma_format; /* YUV format (0=4:0:0, 1=4:2:0, 2=4:2:2, 3=4:4:4,currently only 4:2:0 is supported) */ int input_sample_bit_depth; /* input file bit depth */ int sample_bit_depth; /* sample bit depth */ int sample_precision; /* sample precision */ int aspect_ratio_information; /* aspect ratio information */ int frame_rate_code; /* frame rate code */ float frame_rate; /* frame rate */ int bitrate_lower; /* bit rate lower */ int bitrate_upper; /* bit rate upper */ int low_delay; /* low delay */ int temporal_id_exist_flag; /* temporal_id exist flag */ int bbv_buffer_size; /* bbv buffer size */ int lcu_bit_level; /* largest coding unit size in bit, 3:8x8, 4:16x16, 5:32x32, 6:64x64 */ int scu_bit_level; /* smallest coding unit size in bit, 3:8x8, 4:16x16, 5:32x32, 6:64x64 */ int org_width; /* original source image width */ int org_height; /* original source image height */ // sequence display extension // int video_format; /* video format */ // int video_range; /* video range */ // int color_description; /* color description */ // int color_primaries; /* color primaries */ // int transfer_characteristics; /* transfer characteristics */ // int matrix_coefficients; /* matrix coefficients */ // int display_horizontal_size; /* display horizontal size */ // int display_vertical_size; /* display vertical size */ // int TD_mode; /* 3D mode */ // int view_packing_mode; /* 3D packing mode */ // int view_reverse; /* view reverse */ /* --- stream structure ------------------------------------- */ int intra_period_max; /* maximum intra-period, one I-frame mush appear in any NumMax of frames */ int intra_period_min; /* minimum intra-period, only one I-frame can appear in at most NumMin of frames */ int b_open_gop; /* open GOP? 1: open, 0: close */ int enable_f_frame; /* enable F-frame */ int num_bframes; /* number of B frames that will be used */ int InterlaceCodingOption; /* coding type: frame coding? field coding? */ /* --- picture ---------------------------------------------- */ int progressive_frame; /* progressive frame */ int time_code_flag; /* time code flag */ int top_field_first; /* top field first */ int repeat_first_field; /* repeat first field */ int fixed_picture_qp; /* fixed picture qp */ /* --- slice ------------------------------------------------ */ int slice_num; /* slice number */ /* --- analysis options ------------------------------------- */ int enable_hadamard; /* 0: 'normal' SAD in 1/4 pixel search. 1: use 4x4 Haphazard transform and * Sum of absolute transform difference' in 1/4 pixel search */ int me_method; /* Fast motion estimation method. 1: DIA, 2: HEX 3: UMH */ int search_range; /* search range - integer pel search and 16x16 blocks. The search window is * generally around the predicted vector. Max vector is 2xmcrange. For 8x8 * and 4x4 block sizes the search range is 1/2 of that for 16x16 blocks. 
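 * As a minimal illustrative sketch only (the local names blk_w/blk_h/mvp_x are hypothetical and
 * the real clipping code is not reproduced here), the full-pel window around the predicted
 * vector could be derived as:
 *     int range = (blk_w >= 16 && blk_h >= 16) ? search_range : search_range / 2;
 *     int cx    = mvp_x >> 2;                      // assuming MVs are stored in quarter-pel units
 *     int min_x = cx - range, max_x = cx + range;  // then clamped to the picture + padded edge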
*/ int num_max_ref; /* 1: prediction from the last frame only. 2: prediction from the last or * second last frame etc. Maximum 5 frames (number of reference frames) */ int inter_2pu; /* enable inter 2NxN or Nx2N or AMP mode */ int enable_amp; /* enable Asymmetric Motion Partitions */ int enable_intra; /* enable intra mode for inter frame */ int rdo_bit_est_method; /* RDO bit estimation method: * 0: AEC with context updating; 1: AEC without context update * 2: VLC */ int preset_level; /* preset level */ int is_preset_configured; /* whether preset configuration is utilized */ /* encoding tools ------------------------------------------- */ int enable_mhp_skip; /* enable MHP-skip */ int enable_dhp; /* enabled DHP */ int enable_wsm; /* enable Weight Skip Mode */ int enable_nsqt; /* use NSQT or not */ int enable_sdip; /* use SDIP or not */ int enable_secT; /* secT enabled */ int enable_sao; /* SAO enable flag */ int enable_alf; /* ALF enable flag */ int alf_LowLatencyEncoding; /* ALF low latency encoding enable flag */ int enable_pmvr; /* pmvr enabled */ int b_cross_slice_loop_filter; /* cross loop filter flag */ int enable_dmh; /* DMH mode enable, (always true) */ int i_rd_level; /* RDO level, * 0: off, * 1: only for best partition mode of one CU, * 2: only for best 2 partition modes; * 3: All partition modes */ bool_t b_sao_before_deblock; /* conduct SAO parameter decision before deblock totally finish */ bool_t b_fast_sao; /* Fast SAO encoding decision */ bool_t b_fast_2lelvel_tu; /* enable fast 2-level TU for inter */ float factor_zero_block; /* threadhold factor for zero block detection */ /* RDOQ */ int i_rdoq_level; /* RDOQ level, * 0: off, * 1: only for best partition mode of one CU, * 2: for all modes */ int lambda_factor_rdoq; /* */ int lambda_factor_rdoq_p; /* */ int lambda_factor_rdoq_b; /* */ int enable_refine_qp; /* refine QP? */ int enable_tdrdo; /* enable TDRDO? 
*/ /* loop filter */ int loop_filter_disable; /* loop filter disable */ int loop_filter_parameter_flag; /* loop filter parameter flag */ int alpha_c_offset; /* alpha offset */ int beta_offset; /* beta offset */ /* weight quant */ int enable_wquant; /* enable weight quant */ #if ENABLE_WQUANT int SeqWQM; /* load seq weight quant data flag */ int PicWQEnable; /* weighting quant_flag */ int PicWQDataIndex; /* Picture level WQ data index */ char WeightParamDetailed[WQMODEL_PARAM_SIZE]; char WeightParamUnDetailed[WQMODEL_PARAM_SIZE]; int MBAdaptQuant; int WQParam; /* weight quant param index */ int WQModel; /* weight quant model */ #endif int chroma_quant_param_disable; /* chroma quant param disable */ int chroma_quant_param_delta_u; /* chroma quant param delta cb */ int chroma_quant_param_delta_v; /* chroma quant param delta cr */ /* --- rate control ----------------------------------------- */ int i_rc_method; /* rate control method: 0: CQP, 1: CBR (frame level), 2: CBR (SCU level), 3: VBR */ int i_target_bitrate; /* target bitrate (bps) */ int i_initial_qp; /* initial QP */ int i_min_qp; /* min QP */ int i_max_qp; /* max QP */ /* --- parallel --------------------------------------------- */ int num_parallel_gop; /* number of parallel GOP */ int i_frame_threads; /* number of thread in frame level parallel */ int i_lcurow_threads; /* number of thread in LCU-row level parallel */ int enable_aec_thread; /* enable AEC threadpool or not */ /* --- log -------------------------------------------------- */ int i_log_level; /* log level */ int enable_psnr; /* enable PSNR calculation or not */ int enable_ssim; /* enable SSIM calculation or not */ /* --- reference management --------------------------------- */ int i_gop_size; /* sub GOP size */ xavs2_rps_t cfg_ref_all[XAVS2_MAX_GOPS]; /* ref_man array */ /* --- input/output for testing ----------------------------- */ int infile_header; /* if input file has a header set this to the length of the header */ int output_merged_picture; int num_frames; /* number of frames to be encoded */ #define FN_LEN 128 char psz_in_file[FN_LEN]; /* YUV 4:2:0 input format */ char psz_bs_file[FN_LEN]; /* AVS compressed output bitstream */ char psz_dump_yuv[FN_LEN]; /* filename for reconstructed frames */ #if XAVS2_TRACE char psz_trace_file[FN_LEN]; /* filename for trace information */ #endif #if ENABLE_WQUANT char psz_seq_wq_file[FN_LEN]; char psz_pic_wq_file[FN_LEN]; #endif } xavs2_param_t; /* --------------------------------------------------------------------------- * syntax element set */ typedef struct ctx_set_t { ALIGN16(context_t cu_type_contexts [NUM_CU_TYPE_CTX ]); context_t intra_pu_type_contexts [NUM_INTRA_PU_TYPE_CTX ]; context_t split_flag [NUM_SPLIT_CTX ]; context_t transform_split_flag [NUM_TU_CTX ]; context_t shape_of_partition_index [NUM_AMP_CTX ]; context_t pu_reference_index [NUM_REF_NO_CTX ]; context_t cbp_contexts [NUM_CBP_CTX ]; context_t mvd_contexts [2][NUM_MVD_CTX ]; /* ֡Ԥ */ context_t pu_type_index [NUM_INTER_DIR_CTX ]; // b_pu_type_index[15] = f_pu_type_index[3] + dir_multi_hypothesis_mode[12] context_t b_pu_type_min_index [NUM_INTER_DIR_MIN_CTX ]; // b_pu_type_index2 // for B_NxN // f_pu_type_index2 // for F_NxN context_t cu_subtype_index [DS_MAX_NUM ]; // B_Skip/B_Direct, F_Skip/F_Direct context_t weighted_skip_mode [WPM_NUM ]; /* ֡Ԥ */ context_t intra_luma_pred_mode [NUM_INTRA_MODE_CTX ]; context_t intra_chroma_pred_mode [NUM_INTRA_MODE_C_CTX ]; /* CU QP */ #if ENABLE_RATE_CONTROL_CU context_t delta_qp_contexts [NUM_DELTA_QP_CTX ]; #endif /* 
任ϵ */ context_t coeff_run [2][NUM_BLOCK_TYPES][NUM_MAP_CTX ]; // [0:Luma, 1:Chroma][rank][ctx_idx] context_t nonzero_cg_flag [NUM_SIGN_CG_CTX ]; context_t last_cg_contexts [NUM_LAST_CG_CTX ]; context_t last_pos_contexts [NUM_LAST_POS_CTX ]; context_t coeff_level [NUM_COEFF_LEVEL_CTX ]; /* ģ */ context_t sao_merge_type_index [NUM_SAO_MERGE_FLAG_CTX]; context_t sao_mode [NUM_SAO_MODE_CTX ]; context_t sao_interval_offset_abs [NUM_SAO_OFFSET_CTX ]; context_t alf_cu_enable_scmodel [3][NUM_ALF_LCU_CTX ]; } ctx_set_t; /* --------------------------------------------------------------------------- * struct to characterize the state of the arithmetic coding */ struct aec_t { ALIGN16(uint8_t *p_start); /* actual buffer for written bytes */ /* bitstream */ uint8_t *p; /* pointer to byte written currently */ uint8_t *p_end; /* end of actual buffer for written bytes */ uint32_t reg_flush_bits; /* register: flushing bits (not written into byte buffer) */ uint32_t num_left_flush_bits; /* number of bits in \ref{reg_flush_bits} could be used */ /* AEC codec */ uint32_t i_low; /* low */ uint32_t i_t1; /* t1 */ uint32_t i_bits_to_follow; /* current bit counter to follow */ /* flag */ uint32_t b_writting; /* write to bitstream buffer? */ /* handle */ binary_t binary; /* binary function handles */ /* context */ ctx_set_t *p_ctx_set; /* can reference other aec_t object */ ctx_set_t ctx_set; /* context models for AEC (current object) */ }; /* --------------------------------------------------------------------------- * slice_t */ typedef struct slice_t { bs_t bs; /* bitstream controller */ /* bitstream buffer */ int len_slice_bs_buf; /* length of bitstream buffer */ uint8_t *p_slice_bs_buf; /* pointer of bitstream buffer (start address) */ /* slice buffers */ pel_t *slice_intra_border[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ uint8_t *slice_deblock_flag[2]; /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */ int8_t *slice_ipredmode; /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */ /* slice properties */ int i_first_lcu_xy; /* first LCU index (in scan order) */ int i_last_lcu_xy; /* last LCU index (in scan order) */ int i_first_lcu_y; /* first LCU position y in this slice */ int i_last_lcu_y; /* last LCU position y in this slice */ int i_lcu_row_num; /* number of LCU-row in this slice */ int i_first_scu_y; /* first SCU position y in this slice */ int i_qp; /* slice qp */ int index_slice; /* index of current Slice */ } slice_t; /* --------------------------------------------------------------------------- * prediction mode */ typedef struct neighbor_inter_t { int8_t is_available; /* is block available */ int8_t i_dir_pred; /* prediction direction, -1 for intra or un-available */ int8_t ref_idx[2]; /* reference indexes of 1st and 2nd frame */ mv_t mv[2]; /* motion vectors */ } neighbor_inter_t; /* --------------------------------------------------------------------------- * candidate node, used for intra coding */ struct intra_candidate_t { rdcost_t cost; /* the cost of one mode */ int mode; /* the mode index */ int8_t padding_bytes[4]; /* padding byte number */ }; /* --------------------------------------------------------------------------- * coding block */ typedef union cb_t { struct { int8_t x; /* start position (x, in pixel) within current CU */ int8_t y; /* start position (y, in pixel) within current CU */ int8_t w; /* block width (in pixel) */ int8_t h; /* block height (in pixel) */ }; uint32_t v; /* used for fast 
operation for all components */ } cb_t; /* --------------------------------------------------------------------------- * coding unit data for storing */ struct cu_info_t { /* basic */ int i_scu_x; /* horizontal position for the first SCU in CU */ int i_scu_y; /* vertical position for the first SCU in CU */ pel_t *p_rec[3]; /* reconstruction pixels for current cu [y/u/v] */ coeff_t *p_coeff[3]; /* residual coefficient for current cu [y/u/v] */ int8_t i_level; /* cu level, 3: 8x8, 4: 16x16, 5: 32x32, 6: 64x64 */ #if ENABLE_RATE_CONTROL_CU /* qp */ int8_t i_cu_qp; /* qp of current CU */ int8_t i_delta_qp; /* delta qp */ #endif // ENABLE_RATE_CONTROL_CU /* mode */ int8_t i_mode; /* cu type (partition into prediction units (PUs)) */ int8_t directskip_wsm_idx; /* weighted skip mode */ int8_t directskip_mhp_idx; /* direct skip mode index */ int8_t dmh_mode; /* DMH mode */ /* partition */ int8_t num_pu; /* number of prediction units (PU) */ /* trans size */ int8_t i_tu_split; /* transform unit split flag, tu_split_type_e */ /* cbp */ int8_t i_cbp; /* Coding Block Pattern (CBP) or Coding Transform Pattern (CTP): * Indicating whether transform block (TB) has nonzero coefficients * When it is zero, it means all 6 TBs are zero block */ /* intra predicated mode */ int8_t i_intra_mode_c; /* real intra mode (chroma) */ /* buffers */ cb_t cb[4]; /* coding blocks (2 for inter, 4 for intra) */ /* intra buffers */ int8_t pred_intra_modes[4]; /* pred intra modes */ int8_t real_intra_modes[4]; /* real intra modes */ /* inter buffers */ mv_t mvd[2][4]; /* [fwd,bwd][block_y][block_x] */ #if XAVS2_TRACE mv_t mvp[2][4]; /* [fwd,bwd][block_y][block_x], used only for normal inter mode */ mv_t mv [2][4]; /* [fwd,bwd][block_y][block_x], used only for normal inter mode */ #endif int8_t b8pdir[4]; int8_t ref_idx_1st[4]; /* reference index of 1st direction */ int8_t ref_idx_2nd[4]; /* reference index of 2nd direction */ }; /* --------------------------------------------------------------------------- * cu_mv_mode_t */ typedef struct cu_mv_mode_t { mv_t all_sym_mv[1]; /* ԳģʽMV */ mv_t all_single_mv[MAX_REFS]; /* mvpֻLCUֻһݣ밴ȷֲ */ mv_t all_mvp[MAX_REFS]; /* 1st MVP of dual hypothesis prediction mode, or Foreword of BiPrediction */ /* ˫MVҲֻҪһ */ mv_t all_dual_mv_1st[MAX_REFS]; mv_t all_dual_mv_2nd[MAX_REFS]; } cu_mv_mode_t; /* --------------------------------------------------------------------------- * MVs and references for motion compensation and references */ typedef struct cu_mc_param_t { mv_t mv[4][2]; /* [blockidx][refidx 1st/2nd] */ } cu_mc_param_t; /* --------------------------------------------------------------------------- * cu_mode_t */ typedef struct cu_mode_t { uint8_t mv_padding1[16]; /* Խ磬2ֽڣ˴Ϊ벹16ֽ */ cu_mv_mode_t mvs[MAX_INTER_MODES][4]; /* MVs for normal inter prediction */ cu_mc_param_t best_mc; /* MVs to store */ cu_mc_param_t best_mc_tmp; /* 㷨 OPT_ROUGH_PU_SEL ֡仮ģʽѲһȫţ */ int8_t ref_idx_single[4]; /* [block], preserved for DMH */ mv_t skip_mv_1st[DS_MAX_NUM]; /* MVs for spatial skip modes (only for F and B frames) */ mv_t skip_mv_2nd[DS_MAX_NUM]; int8_t skip_ref_1st[DS_MAX_NUM]; /* reference indexes */ int8_t skip_ref_2nd[DS_MAX_NUM]; mv_t tskip_mv[4][MAX_REFS]; /* MVs for temporal skip modes (Weighted skip and the default) */ // int8_t all_intra_mode[1 << ((MAX_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT) * 2)]; // int8_t all_ctp_y[1 << ((MAX_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT) * 2)]; } cu_mode_t; /* --------------------------------------------------------------------------- * cu_feature_t, used for fast 
encoding decision */ typedef struct cu_feature_t { int64_t intra_complexity; /* intra complexity */ int64_t complexity; /* minimum of intra and inter complexity */ double variance; /* variance of current CU */ double avg_variance_sub_block; /* average variance of 4 sub CUs */ double var_variance_sub_block; /* variance of variance of 4 sub CUs */ double var_diff; /* variance difference */ rdcost_t intra_had_cost; rdcost_t rdcost; rdcost_t rdcost_luma; /* 0: try both (not determined); * 1: only try split; * 2: only try current depth * --------------------------- */ int pred_split_type; /* prediction of cu split type: 0: un-determined; 1: split; 2: not-split */ rdcost_t pred_costs[MAX_PRED_MODES]; /* ÿPUģʽ cost ԤȻȡ */ } cu_feature_t; /* --------------------------------------------------------------------------- * coding unit */ struct cu_t { /* basic */ int i_size; /* cu size */ int i_pos_x; /* pixel position (x) within CTU */ int i_pos_y; /* pixel position (y) within CTU */ int i_pix_x; /* pixel position (x) within picture */ int i_pix_y; /* pixel position (y) within picture */ int i_scu_xy; /* CU position within picture */ cu_info_t cu_info; /* information of CU */ cu_mc_param_t mc; /* motion information for MC and neighboring prediction */ /* pointer to neighbor CUs. NULL pointer identifies unavailable */ cu_info_t *p_left_cu; /* pointer to left neighbor cu */ cu_info_t *p_topA_cu; /* pointer to top-above neighbor cu */ cu_info_t *p_topL_cu; /* pointer to top-left neighbor cu */ cu_info_t *p_topR_cu; /* pointer to top-right neighbor cu */ /* block available */ uint8_t intra_avail; /* intra availability of current CU */ uint8_t block_avail; /* intra availability of current intra PU */ int8_t in_lcu_edge; /* 0: on top-left of LCU; 1: x on edge (y == 0); 2: y on edge (x == 0); 3: not on edge */ int8_t idx_zorder; /* CU z-order index in CTU (basic: 8x8 CU) */ int8_t idx_cu_bfs; /* index of CU in BFS scan order */ /* TU size */ bool_t is_ctu_split; /* whether CTU is split */ bool_t b_cbp_direct; /* direct mode total distortion is zero*/ bool_t is_zero_block; /* low residual in luma component */ int sum_satd; /* satd sum for zero block detection */ /* splitting */ cu_t *sub_cu[4]; /* pointer to 4 sub CUs split */ /* RDO result of current depth */ rdcost_t rdcost; /* RD-Cost of current CTU */ dist_t best_dist_total; /* Total distortion for current CU, no split */ cu_feature_t feature; /* used for fast mode decision */ dist_t mvcost[4]; /* mvcost of every pu*/ }; /* --------------------------------------------------------------------------- * frame complexity */ typedef struct complex_t { int64_t i_best_cost; /* sum best cost of whole frame */ int64_t i_inter_cost; /* sum inter cost of whole frame */ int64_t i_intra_cost; /* sum intra cost of whole frame */ int i_sum_intras; /* number of intra blocks in frame */ int i_sum_blocks; /* number of total blocks in frame */ int i_slice_type; /* slice type of frame, or -1 for uncertain */ int b_valid; /* indicates whether complexity estimation has conducted */ } complex_t; #if XAVS2_ADAPT_LAYER /* --------------------------------------------------------------------------- * nal info */ typedef struct xavs2_nal_info_t { short i_priority; /* nal_priority_e */ short i_type; /* nal_unit_type_e */ int i_payload; /* size of payload in bytes */ } xavs2_nal_info_t; #endif typedef struct com_stat_t { double f_psnr[3]; /* psnr for all components: Y, U, V */ double f_ssim[3]; /* SSIM for all components: Y, U, V */ double f_lambda_frm; /* lambda of current frame */ 
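    /* Illustrative usage sketch only (not part of the API; it assumes the fields of this struct
     * accumulate per-frame sums, so a report divides by num_frames; frame_rate stands for the
     * configured frame rate):
     *     double avg_psnr_y = stat.f_psnr[IMG_Y] / stat.num_frames;
     *     double avg_kbps   = stat.i_frame_size * 8.0 * frame_rate / stat.num_frames / 1000.0;
     */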
// int64_t i_time_start; /* encoding start time */ // int64_t i_time_end; /* encoding end time */ int64_t i_time_duration; /* encoding time */ int64_t i_frame_size; /* frame size (bs len) */ int num_frames; } com_stat_t; /* --------------------------------------------------------------------------- * xavs2_frame_t */ struct xavs2_frame_t { /* magic number */ ALIGN16(void *magic); /* must be the 1st member variable. do not change it */ /* properties */ int i_frm_type; /* frame type: XAVS2_TYPE_* */ int i_state; /* flag, -1 for exit flag in thread */ int b_keyframe; /* key frame? */ int64_t i_pts; /* user pts (Presentation Time Stamp) */ int64_t i_dts; /* user dts (Decoding Time Stamp) */ int64_t i_reordered_pts; /* reordered PTS (in coding order) */ int i_frame; /* presentation frame number */ int i_frm_coi; /* COI (coding order index) */ int i_frm_poc; /* POC (picture order count), used for MV scaling */ int i_gop_idr_coi; /* COI of IDR frame in this gop */ int ref_dpoc[MAX_REFS]; /* POC difference of its reference frames */ int ref_dpoc_multi[MAX_REFS]; /* MULTI / ref_dpoc[x] */ int i_frm_qp; /* QP of frame */ int i_frm_lambda_sad; /* frame level lambda in SAD domain */ double f_frm_lambda_ssd; /* frame level lambda in SSD domain */ int i_qpplus1; /* qp + 1: used for rate control */ xavs2_rps_t rps; int rps_index_in_gop; bool_t b_random_access_decodable; /* random_access_decodable_flag */ /* YUV buffer */ int i_plane; /* number of planes */ int i_stride[3]; /* stride for Y/U/V */ int i_width[3]; /* width for Y/U/V */ int i_lines[3]; /* height for Y/U/V */ pel_t *planes[3]; /* pointers to Y/U/V data buffer */ pel_t *filtered[16]; /* pointers to interpolated luma data buffers */ pel_t *plane_buf; int size_plane_buf; /* bit stream buffer */ uint8_t *p_bs_buf; /* bit stream buffer for encoding this frame */ int i_bs_buf; /* length of bit stream buffer */ int i_bs_len; /* length of bit stream data */ int b_enable_intra; /* enable intra coding in frame level */ /* encoding parameters */ int8_t *pu_ref; /* pu reference index (store in 16x16 block) */ mv_t *pu_mv; /* pu motion vector (store in 16x16 block) */ #if SAVE_CU_INFO int8_t *cu_mode; /* cu type (store in SCU) */ int8_t *cu_cbp; /* cu cbp (store in SCU) */ int8_t *cu_level; /* cu size in bit (store in SCU) */ #endif int num_lcu_sao_off[NUM_SAO_COMPONENTS]; /* */ uint32_t cnt_refered; /* reference count for FT_DEC */ int *num_lcu_coded_in_row; /* 0, not ready, 1, ready */ xavs2_thread_cond_t cond; xavs2_thread_mutex_t mutex; #if XAVS2_ADAPT_LAYER /* nal */ int i_nal; /* number of nal */ xavs2_nal_info_t *nal_info; /* nal information */ #endif #if XAVS2_STAT int64_t i_time_start; /* encoding start time */ int64_t i_time_end; /* encoding end time */ #endif }; /* --------------------------------------------------------------------------- * xavs2_me_t */ typedef struct xavs2_me_t { /* PU info */ int16_t i_ref_idx; /* current reference index */ int16_t i_pixel; /* partition index via the block width and height */ int i_bias; /* offset of the current PU block in the frame */ int i_pix_x; /* pixel position (x) in frame */ int i_pix_y; /* pixel position (y) in frame */ int i_block_w; /* width of the current PU block */ int i_block_h; /* height of the current PU block */ bool_t b_search_dmh; /* is searching for DMH mode */ /* pointers */ pel_t *p_fenc; /* pointer to the current PU block in source CTU */ xavs2_frame_t *p_fref_1st; /* pointer to the current (1st) reference frame */ xavs2_frame_t *p_fref_2nd; /* pointer to the current 2nd reference 
frame */ int i_distance_1st; /* distance index for 1st reference frame */ int i_distance_2nd; /* distance index for 2nd reference frame */ /* thresholds for UMH */ double beta2; double beta3; /* SAD prediction */ dist_t pred_sad_space; dist_t pred_sad_ref; dist_t pred_sad_uplayer; dist_t pred_sad; /* mv range */ int mv_min[2]; /* allowed qpel MV range to stay within */ int mv_max[2]; /* the picture + emulated edge pixels */ int mv_min_fpel[2]; /* full pel MV range for motion search */ int mv_max_fpel[2]; /* pred motion vector */ mv_t mvp; /* pred motion vector for the current block */ mv_t mvp1; /* MVP via space */ mv_t mvp2; /* MVP via temporal collocation (previous search result) */ mv_t mvp3; /* MVP via collocated frame */ /* output */ mv_t bmv; /* best motion vector (subpel ) */ mv_t bmv2; /* best motion vector (fullpel) */ dist_t bcost; /* best cost of subpel motion search, satd + lambda * nbits */ dist_t bcost2; /* best cost of fullpel motion search, sad + lambda * nbits */ dist_t mvcost[5]; /* mv cost for every direction*/ dist_t bmvcost[5]; /* cost of best mv of all ref for every direction */ mv_t all_best_mv[MAX_INTER_MODES][4][MAX_REFS]; /* all best mv results generated in ME (single) */ mv_t all_best_imv[MAX_REFS]; /* best integer MV for current PU in current CU */ } xavs2_me_t; /* --------------------------------------------------------------------------- * SAOStatData */ typedef struct SAOStatData{ long diff[MAX_NUM_SAO_CLASSES]; long count[MAX_NUM_SAO_CLASSES]; } SAOStatData; /* --------------------------------------------------------------------------- * ALFParam */ typedef struct ALFParam { int alf_flag; int num_coeff; int filters_per_group; int filterPattern[NO_VAR_BINS]; int coeffmulti[NO_VAR_BINS][ALF_MAX_NUM_COEF]; } ALFParam; /* --------------------------------------------------------------------------- * parameters and buffers for RDOQ */ typedef struct rdoq_t { /* buffers */ ALIGN32(coeff_t coeff_buff[32 * 32]); ALIGN32(coeff_t ncur_blk [32 * 32]); ALIGN32(int8_t sig_cg_flag[64]); /* pointers */ context_t *p_ctx_coeff_run; context_t *p_ctx_coeff_level; context_t (*p_ctx_primary)[NUM_MAP_CTX]; context_t *p_ctx_sign_cg; context_t *p_ctx_last_cg; context_t *p_ctx_last_pos; const int16_t *p_scan_tab_1d; /* scan table */ const int16_t (*p_scan_cg)[2]; /* scan table (CG) */ /* properties */ int num_cg_x; /* number of CG in x axis */ int num_cg_y; /* number of CG in y axis */ int bit_size_shift_x; /* log2 (block size x) */ int bit_size_shift_y; /* log2 (block size x) */ int i_tu_level; /* */ int b_luma; /* is luma? */ int b_dc_diag; /* is INTRA_PRED_DC_DIAG or not */ } rdoq_t; #if ENABLE_WQUANT /* --------------------------------------------------------------------------- * weighted quant data */ typedef struct wq_data_t { int16_t wq_param [2][ 6]; int16_t cur_wq_matrix [4][64]; // [matrix_id][coef] int16_t wq_matrix [2][2][64]; // [matrix_id][detail/undetail][coef] int16_t seq_wq_matrix [2][64]; // [matrix_id][coef] int16_t pic_user_wq_matrix[2][64]; // [matrix_id][coef] int LevelScale4x4 [2][ 4 * 4]; int LevelScale8x8 [2][ 8 * 8]; // [intra/inter][j * stride + i] int LevelScale16x16[2][16 * 16]; int LevelScale32x32[2][32 * 32]; int *levelScale[4][2]; // [bit_size][intra/inter] int cur_frame_wq_param; // weighting quant param } wq_data_t; #endif /* --------------------------------------------------------------------------- * The data within the payload is already NAL-encapsulated; the ref_idc and * type are merely in the struct for easy access by the calling application. 
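 * For example, a caller could persist the payloads with something like the following sketch
 * (illustrative only; the names num_nal, p_nal and f_bs are hypothetical locals, and how the
 * nal_t array is obtained is not shown in this header):
 *     for (i = 0; i < num_nal; i++)
 *         fwrite(p_nal[i].p_payload, 1, p_nal[i].i_payload, f_bs);  // copy before the next encode call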
* All data returned in an nal_t, including the data in p_payload, is no * longer valid after the next call to xavs2_encoder_encode. */ typedef struct nal_t { int i_ref_idc; /* nal_priority_e */ int i_type; /* nal_unit_type_e */ int i_payload; /* size of payload in bytes */ uint8_t *p_payload; /* payload */ } nal_t; /* --------------------------------------------------------------------------- * lcu_info_t */ typedef struct lcu_info_t { ALIGN32(coeff_t coeffs_y[MAX_CU_SIZE * MAX_CU_SIZE]); /* dct coefficients of Y component */ ALIGN32(coeff_t coeffs_uv[2][MAX_CU_SIZE * MAX_CU_SIZE / 4]); /* dct coefficients of U/V component */ int scu_xy; /* index (scan order ) for the first SCU in lcu */ int pix_x; /* horizontal position (in pixel) of lcu (luma) */ int pix_y; /* vertical position (in pixel) of lcu (luma) */ int slice_index; /* slice index */ #if ENABLE_RATE_CONTROL_CU int last_dqp; /* last delta QP */ #endif } lcu_info_t; /* --------------------------------------------------------------------------- * row_info_t */ typedef struct row_info_t { int row; /* row index [0, xavs2_t::i_height_in_lcu) */ int b_top_slice_border; /* whether top slice border should be processed */ int b_down_slice_border; /* whether down slice border should be processed */ volatile int coded; /* position of latest coded LCU. [0, xavs2_t::i_width_in_lcu) */ xavs2_t *h; /* context for the row */ lcu_info_t *lcus; /* [LCUs] */ xavs2_thread_cond_t cond; /* lcu cond */ xavs2_thread_mutex_t mutex; aec_t aec_set; /* aec contexts of the 2nd LCU which will be * referenced by the next row on startup */ } row_info_t; #if XAVS2_STAT /* --------------------------------------------------------------------------- * struct for encoding statistics of one frame */ typedef struct frame_stat_t { int i_type; /* frame type */ int i_frame; /* POC */ int i_qp; /* frame QP */ int i_ref; /* number of reference frames */ int ref_poc_set[XAVS2_MAX_REFS]; /* POCs of reference frames */ com_stat_t stat_frm; } frame_stat_t; /* --------------------------------------------------------------------------- * struct for encoding statistics of all frames */ typedef struct xavs2_stat_t { int64_t i_start_time; /* encoding start time */ int64_t i_end_time; /* encoding end time */ com_stat_t stat_i_frame; com_stat_t stat_p_frame; com_stat_t stat_b_frame; com_stat_t stat_total; int num_frame_small_qp; /* number of frames whose QP is too small */ } xavs2_stat_t; #endif /* --------------------------------------------------------------------------- * frame_info_t */ typedef struct frame_info_t { row_info_t *rows; /* all lcu rows */ #if XAVS2_STAT frame_stat_t frame_stat; /* encoding statistics */ #endif } frame_info_t; /* --------------------------------------------------------------------------- * outputframe_t */ struct outputframe_t { xavs2_frame_t *frm_enc; /* frame with nalus */ #if XAVS2_STAT frame_stat_t out_frm_stat; /* encoding statistics */ #endif outputframe_t *next; /* pointer to the next output frame */ }; /* --------------------------------------------------------------------------- * buffer data used for each cu layer in xavs2_t */ typedef struct cu_layer_t { rdcost_t best_rdcost; /* best rd-cost of current CU */ rdcost_t mode_rdcost[MAX_PRED_MODES]; /* min rd-cost for each mode */ int mask_md_res_pred; /* available mode mask */ pel_t *p_rec_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ coeff_t *p_coeff_tmp[3]; /* tmp pointers to ping-pong buffer for swapping */ cu_info_t cu_best; /* best info for each cu depth */ cu_mode_t cu_mode; 
/* mode info for each cu depth (TODO: simplification for motion info like x265 ?) */ intra_candidate_t intra_candidates[INTRA_MODE_NUM_FOR_RDO + 1]; /* candidate list, reserving the cost */ neighbor_inter_t neighbor_inter[BLK_COL + 1]; /* neighboring inter modes of 4x4 blocks*/ aec_t cs_cu ; /* coding state after encoding each cu partition mode for current CU level */ aec_t cs_rdo; /* coding state for mode decision (rdo) */ // uint8_t padding_bytes[24]; /* padding bytes to make align */ #define FENC_STRIDE (MAX_CU_SIZE) /* stride for LCU enc buffer, Y component */ #define FDEC_STRIDE (MAX_CU_SIZE) /* stride for LCU dec buffer, Y component */ #define FREC_STRIDE (p_cu->i_size) /* stride for current CU, Y component */ #define FREC_CSTRIDE (p_cu->i_size) /* stride for current CU, UV component */ #define FENC_BUF_SIZE (FENC_STRIDE * (MAX_CU_SIZE + MAX_CU_SIZE / 2)) #define FDEC_BUF_SIZE (FDEC_STRIDE * (MAX_CU_SIZE + MAX_CU_SIZE / 2)) #define LCU_BUF_SIZE (MAX_CU_SIZE * MAX_CU_SIZE) ALIGN32(pel_t rec_buf_y [3][LCU_BUF_SIZE]); /* luma reconstruction buffer [cur/tmp/best][] */ ALIGN32(coeff_t coef_buf_y [3][LCU_BUF_SIZE]); /* luma coefficient buffer [cur/tmp/best][] */ ALIGN32(pel_t rec_buf_uv [2][3][LCU_BUF_SIZE >> 2]); /* chroma reconstruction buffer [uv][cur/tmp/best][] */ ALIGN32(coeff_t coef_buf_uv[2][3][LCU_BUF_SIZE >> 2]); /* chroma coefficient buffer [uv][cur/tmp/best][] */ /* inter prediction buffer */ ALIGN32(pel_t buf_pred_inter_luma[2][LCU_BUF_SIZE]); /* temporary decoding buffer for inter prediction (luma) */ /* Ping-pong buffer for inter prediction */ pel_t *buf_pred_inter; /* current inter prediction buffer */ pel_t *buf_pred_inter_best; /* backup of best inter prediction */ } cu_layer_t; /* --------------------------------------------------------------------------- * buffer data used for encode each CU layer */ typedef struct cu_parallel_t { /* dct coefficients buffers */ ALIGN32(coeff_t coeff_blk[LCU_BUF_SIZE]); ALIGN32(coeff_t coeff_bak[LCU_BUF_SIZE]); /* buffers used for inter prediction */ ALIGN32(pel_t buf_pred_inter_c[LCU_BUF_SIZE >> 1]); /* temporary decoding buffer for inter prediction (chroma) */ ALIGN32(pel_t buf_pixel_temp [LCU_BUF_SIZE]); /* temporary pixel buffer, used for bi/dual-prediction */ /* predication buffers for all intra modes */ ALIGN32(pel_t intra_pred [NUM_INTRA_MODE ][LCU_BUF_SIZE]); /* for all 33 luma prediction modes */ ALIGN32(pel_t intra_pred_c[NUM_INTRA_MODE_CHROMA][LCU_BUF_SIZE >> 1]); /* for all chroma intra prediction modes */ ALIGN32(pel_t buf_edge_pixels[MAX_CU_SIZE << 3]); /* reference pixels for intra luma/chroma prediction */ runlevel_t runlevel; /* run level buffer for RDO */ /* parameters for RDOQ */ ALIGN16(rdoq_t rdoq_info); aec_t cs_tu; /* coding state after encoding cu with different TU partition, or PU partition in intra */ aec_t cs_pu_init; /* coding state before encoding one CU partition */ } cu_parallel_t; /* --------------------------------------------------------------------------- */ struct xavs2_log_t { int i_log_level; /* log level */ char module_name[60]; /* module name */ }; /* --------------------------------------------------------------------------- * xavs2_t */ struct xavs2_t { ALIGN32(xavs2_log_t module_log); /* log module */ /* === BEGIN =================================================== * communal variables * м֡ʼ */ ALIGN32(SYNC_VARS_1(communal_vars_1)); const xavs2_param_t* param; /* input parameters */ /* ------------------------------------------------------------- * contexts synchronization control */ 
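    /* Illustrative sketch (not a quote from the encoder; h and p_cu are hypothetical locals) of how
     * the fixed-stride LCU buffers kept in cu_layer_t and in the lcu sub-struct are typically addressed:
     *     pel_t *p_src = h->lcu.p_fenc[IMG_Y] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x;
     *     pel_t *p_dst = h->lcu.p_fdec[IMG_Y] + p_cu->i_pos_y * FDEC_STRIDE + p_cu->i_pos_x;
     */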
xavs2_handler_t*h_top; /* encoder top handler */ task_type_e task_type; /* task type: frame/slice/row */ task_status_e task_status; /* for frame tasks: task status */ int i_aec_frm; /* for frame tasks(task order for aec): [0, i_frame_threads) */ int b_all_row_ctx_released; /* is all row context released */ /* ------------------------------------------------------------- * encoder contexts */ ratectrl_t *rc; /* rate control */ td_rdo_t *td_rdo; /* pointer to struct td_rdo_t */ uint32_t valid_modes[SLICE_TYPE_NUM][CTU_DEPTH]; /* [frame_type][bit_size] : valid modes for mode decision */ uint64_t i_fast_algs; /* all fast algorithms enabled */ bool_t b_progressive; bool_t b_field_sequence; bool_t use_fractional_me; /* whether use fractional Motion Estimation * 0: رշ1: 1/22:1/4 */ bool_t use_fast_sub_me; /* whether use fast quarter Motion Estimation: skip half fractional search point (from futl) */ bool_t UMH_big_hex_level; /* whether skip big hex pattern when using UMH (from futl) 0 : skip this step 1 : 8 points. 0.17% loss ~ 4% TimeSaving 2 : 16 points */ bool_t enable_tu_2level; /* enable 2-level TU for inter , * 0: off, * 1: tu-2level only for best partition mode of one CU, * 2: tu-2level rdo , * 3: tu - 2level rdoq */ bool_t skip_rough_improved; /* whether use the improved SKIP_ROUGH_SEL (from leimeng) */ float framerate; int i_gop_size; /* sub GOP size */ int picture_reorder_delay;/* picture reorder delay */ int i_lcu_level; /* level of largest cu, 3: 8x8, 4: 16x16, 5: 32x32, 6: 64x64 */ int i_scu_level; /* level of smallest cu, 3: 8x8, 4: 16x16, 5: 32x32, 6: 64x64 */ int i_width; /* frame width (number of pels, 8N, Luma) */ int i_height; /* frame height (number of lines, 8N, Luma) */ int i_width_in_lcu; /* frame width in lcu */ int i_height_in_lcu; /* frame height in lcu */ int i_width_in_mincu; /* frame width in 8x8-block */ int i_height_in_mincu; /* frame height in 8x8-block */ int i_width_in_minpu; /* frame width in 4x4-block */ int i_height_in_minpu; /* frame height in 4x4-block */ int i_chroma_v_shift; /* chroma vertical shift bits */ int i_max_ref; /* max number of reference frames */ int min_mv_range[2]; /* mv range (min) decided by the level id */ int max_mv_range[2]; /* mv range (max) decided by the level id */ /* function pointers */ int (*get_intra_candidates_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); int (*get_intra_candidates_chroma)(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list); void (*copy_aec_state_rdo)(aec_t *dst, aec_t *src); /* pointer to copy aec_t */ int size_aec_rdo_copy; /* number of bytes to copy in RDO for \function aec_copy_aec_state_rdo() */ uint8_t *tab_avail_TR; /* pointers to array of available table, Top Right */ uint8_t *tab_avail_DL; /* pointers to array of available table, Down Left */ uint8_t tab_num_intra_rdo[MAX_CU_SIZE_IN_BIT + 1]; /* pointers to array of table, indicate numbers of intra prediction modes for RDO */ int8_t num_intra_rmd_dist2; /* 2ĽǶȵ */ int8_t num_intra_rmd_dist1; /* 1ĽǶȵ */ int8_t num_rdo_intra_chroma; /* number of RDO modes for intra chroma prediction */ SYNC_VARS_2(communal_vars_2); /* === END ===================================================== */ /* === BEGIN =================================================== * row-dependent variables : values below need to be synchronized between rows * ֡ʼÿ֡Ķм֮߳ͬ */ SYNC_VARS_1(row_vars_1); frame_info_t *frameinfo; /* pointer to 
the frame info buffer */ int i_type; /* frame type: SLICE_TYPE_* */ int i_layer; /* temporal index of coding frame */ int i_qp; /* frame level QP */ int ip_pic_idx; /* encoded I/P/F-picture index (to be REMOVED) */ int i_frame_b; /* number of encoded B-picture in a GOP */ int b_top_field; /* top field flag */ rdcost_t f_lambda_mode; /* lambda for mode cost and motion cost */ rdcost_t f_lambda_rdoq; /* lambda for RDOQ */ int i_lambda_factor; /* factor for determining Lagrangian's motion cost */ double f_lambda_1th; /* 1.0 / f_lambda_mode */ xavs2_frame_t *fenc; /* current frame being encoded */ xavs2_frame_t *fdec; /* current frame being reconstructed */ int i_ref; /* current number of reference frames */ xavs2_frame_t *fref[MAX_REFS]; /* reference frame list */ mct_t *img4Y_tmp[3]; /* temporary buffer for 1/4 interpolation: a,1,b */ xavs2_frame_t *img_luma_pre; /* buffer used for TDRDO, only luma */ /* slices */ slice_t *slices[MAX_SLICES]; /* all slices */ int i_slice_index; /* slice index for the current thread */ /* ͬSliceͬbuffer */ pel_t *intra_border[3]; /* buffer for store decoded bottom pixels of the top lcu row (before filter) */ uint8_t *p_deblock_flag[2]; /* buffer for edge filter flag (of one LCU row), [dir][(scu_y, scu_x)] */ int8_t *ipredmode; /* [(i_height_in_minpu + 1) * (i_width_in_minpu + 16)], prediction intra mode */ /* ֡Ψһbuffer */ int8_t *lcu_slice_idx; /* [i_height_in_lcu][i_width_in_lcu] */ int8_t *dir_pred; /* [i_height_in_minpu][i_width_in_minpu], inter prediction direction */ int8_t *fwd_1st_ref; /* [i_height_in_minpu][i_width_in_minpu] */ int8_t *bwd_2nd_ref; /* [i_height_in_minpu][i_width_in_minpu] */ mv_t *fwd_1st_mv; /* [i_height_in_minpu][i_width_in_minpu] */ mv_t *bwd_2nd_mv; /* [i_height_in_minpu][i_width_in_minpu] */ uint16_t *mvbits; /* used for getting the mv bits */ dist_t (*all_mincost)[MAX_INTER_MODES][MAX_REFS]; /* store the min SAD (in 4x4 PU) */ double umh_bsize[MAX_INTER_MODES]; double thres_qsfd_cu[2][CTU_DEPTH]; /* QSFD threshold for inter frame, [0:inter, 1:intra][log2_cu_size - 3] */ xavs2_frame_t *img_sao; /* reconstruction image for SAO */ SAOStatData(*sao_stat_datas)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]; /* [lcu][comp][types], ɲȫ */ SAOBlkParam(*sao_blk_params)[NUM_SAO_COMPONENTS]; /* [lcu][comp] */ int (*num_sao_lcu_off)[NUM_SAO_COMPONENTS]; /* [lcu_row][comp] */ bool_t slice_sao_on [NUM_SAO_COMPONENTS]; xavs2_frame_t *img_alf; /* reconstruction image for ALF */ void *enc_alf; /* handler of ALF encoder */ ALFParam pic_alf_params[IMG_CMPNTS]; bool_t (*is_alf_lcu_on)[IMG_CMPNTS]; /* [lcu][comp] */ int pic_alf_on[IMG_CMPNTS]; #if ENABLE_WQUANT int WeightQuantEnable; /* enable weight quantization? 
*/ wq_data_t wq_data; #endif cu_info_t *cu_info; /* pointer to buffer of all SCUs in frame */ SYNC_VARS_2(row_vars_2); /* === END ===================================================== */ nal_t *p_nal; /* pointer to struct nal_t */ int i_nal; /* current NAL index */ int i_nal_type; /* NAL type */ int i_nal_ref_idc; /* NAL priority */ bs_t header_bs; /* bitstream controller for main thread */ uint8_t *p_bs_buf_header; /* pointer to bitstream buffer for headers */ uint8_t *p_bs_buf_slice; /* pointer to bitstream buffer for slices */ int i_bs_buf_header; /* size of bitstream buffer for headers */ int i_bs_buf_slice; /* size of bitstream buffer for slices */ xavs2_me_t me_state; /* used for motion estimation */ aec_t aec; /* ac engine for RDO */ #if ENABLE_RATE_CONTROL_CU int *last_dquant; #endif struct lcu_t { /* variable properties when coding each LCU ---------------- */ ALIGN16(int16_t i_pix_width); /* actual width (in pixel) for current lcu */ int16_t i_pix_height; /* actual height (in pixel) for current lcu */ int i_pix_x; /* horizontal position (in pixel) of lcu (luma) */ int i_pix_y; /* vertical position (in pixel) of lcu (luma) */ int i_scu_x; /* horizontal position (raster scan order in frame buffer) for the first SCU of lcu */ int i_scu_y; /* vertical position (raster scan order in frame buffer) for the first SCU of lcu */ int i_scu_xy; /* SCU index (raster scan order in frame buffer) for the top-left SCU of current lcu */ int i_lcu_xy; /* LCU index (raster scan order in frame buffer) for current lcu */ bool_t b_enable_rdoq; bool_t bypass_all_dmh; bool_t b_2nd_rdcost_pass; /* 2nd pass for RDCost update */ /* function pointers for RDO */ int (*get_intra_dir_for_rdo_luma)(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); int (*get_skip_mvs)(xavs2_t *h, cu_t *p_cu); /* get MVs for skip/direct mode */ /* buffer and status for RDO & ENC ------------------------- */ /* 1, coding tree */ cu_t *p_ctu; /* pointer to the top of current CTU */ /* 2, enc/dec/pred Y/U/V pointers */ pel_t *p_fdec[3]; /* [Y/U/V] pointer over lcu of the frame to be reconstructed */ pel_t *p_fenc[3]; /* [Y/U/V] pointer over lcu of the frame to be compressed */ coeff_t *lcu_coeff[3]; /* [Y/U/V] coefficients of LCU */ // uint8_t padding_bytes[24];/* padding bytes to make align */ /* data used in each ctu layer */ #define PARALLEL_INSIDE_CTU 0 cu_layer_t cu_layer[CTU_DEPTH]; #if PARALLEL_INSIDE_CTU cu_parallel_t cu_enc [CTU_DEPTH]; #else cu_parallel_t cu_enc [1]; /* CTUڵĶ߳ʱֻҪһ */ #endif ALIGN32(pel_t fenc_buf[FENC_BUF_SIZE]); /* encoding buffer (source Y/U/V buffer) */ ALIGN32(pel_t fdec_buf[FDEC_BUF_SIZE]); /* decoding buffer (Reconstruction Y/U/V buffer) */ struct lcu_intra_border_t { ALIGN32(pel_t rec_left[MAX_CU_SIZE]); /* Left border of current LCU */ ALIGN32(pel_t rec_top[MAX_CU_SIZE * 2 + 32]); /* top-left, top and top-right samples (Reconstruction) of current LCU */ } ctu_border[IMG_CMPNTS]; /* Y, U, V components */ /* buffer for the coding tree units */ ALIGN16(cu_t all_cu[85]); /* all cu: 1(64x64) + 4(32x32) + 16(16x16) + 64(8x8) = 85 */ ALIGN16(cu_t *p_cu_l[4][8][8]); /* all CU pointers */ /* only used for AEC */ runlevel_t run_level_write; /* run-level buffer for encoding */ } lcu; /* coding states in RDO, independent for each thread */ struct coding_states { /* ֻڱ״̬ʼ */ aec_t cs_sao_start; aec_t cs_sao_best; aec_t cs_sao_temp; aec_t cs_alf_cu_ctr; aec_t cs_alf_initial; } cs_data; }; /** * 
=========================================================================== * general function declares * =========================================================================== */ /* time (us) */ #define xavs2_mdate FPFX(mdate) int64_t xavs2_mdate(void); /* trace */ #if XAVS2_TRACE #define xavs2_trace_init FPFX(trace_init) int xavs2_trace_init(xavs2_param_t *param); #define xavs2_trace_destroy FPFX(trace_destroy) void xavs2_trace_destroy(void); #define xavs2_trace FPFX(trace) int xavs2_trace(const char *psz_fmt, ...); #endif /* thread */ #if HAVE_WIN32THREAD || PTW32_STATIC_LIB #define xavs2_threading_init FPFX(threading_init) int xavs2_threading_init(void); #else #define xavs2_threading_init() 0 #endif #define xavs2_create_thread FPFX(create_thread) int xavs2_create_thread(xavs2_thread_t *tid, xavs2_tfunc_t start, void *arg); #define xavs2_log FPFX(log) void xavs2_log(void *p, int i_log_level, const char *psz_fmt, ...); /* --------------------------------------------------------------------------- * memory alloc */ /* xavs2_malloc : will do or emulate a memalign * you have to use xavs2_free for buffers allocated with xavs2_malloc */ #define xavs2_malloc FPFX(malloc) void *xavs2_malloc(size_t i_size); #define xavs2_calloc FPFX(calloc) void *xavs2_calloc(size_t count, size_t size); #define xavs2_free FPFX(free) void xavs2_free(void *ptr); #define xavs2_get_total_malloc_space FPFX(get_total_malloc_space) size_t xavs2_get_total_malloc_space(void); #define g_xavs2_default_log FPFX(g_xavs2_default_log) extern xavs2_log_t g_xavs2_default_log; /** * =========================================================================== * global const tables * =========================================================================== */ #include "avs2tab.h" #endif // XAVS2_COMMON_H xavs2-1.3/source/common/common_arm.c000066400000000000000000000122051340660520300175020ustar00rootroot00000000000000/* * common_arm.c * * Description of this file: * common tables definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifdef __ARM_ARCH_7A__ #include "common_arm.h" #include "common.h" //ARM_PART_4x2=10, ARM_PART_2x4 = 12, ARM_PART_8x2= 18, ARM_PART_2x8=24,ARM_PART_8x6=54,ARM_PART_6x8=56 const unsigned char g_arm_partition_map_tab[] = { ARM_PLANE_COPY_W88, 255, ARM_PLANE_COPY_W160, ARM_PLANE_COPY_W176, 255, 255, 255, ARM_PLANE_COPY_W320, ARM_PLANE_COPY_W352, ARM_PLANE_COPY_W360, ARM_PART_4x2, 255, ARM_PART_2x4, ARM_PLANE_COPY_W512, 255, 255, 255, ARM_PLANE_COPY_W640, ARM_PART_8x2, ARM_PLANE_COPY_W704, ARM_PLANE_COPY_W720, 255, 255, 255, ARM_PART_2x8, 255, 255, ARM_PLANE_COPY_W960, 255, ARM_PLANE_COPY_W1024, 255, 255, 255, 255, 255, 255, 255, ARM_PLANE_COPY_W1280, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ARM_PART_8x6, 255, ARM_PART_6x8, ARM_PLANE_COPY_W1920, 255, 255, 255, 255, 255, 255 }; /* g_T4[0][0] * g_T4[0][0], g_T4[0][0] * g_T4[1][0], g_T4[0][0] * g_T4[2][0], g_T4[0][0] * g_T4[3][0], g_T4[0][0] * g_T4[0][1], g_T4[0][0] * g_T4[1][1], g_T4[0][0] * g_T4[2][1], g_T4[0][0] * g_T4[3][1], g_T4[0][1] * g_T4[0][0], g_T4[0][1] * g_T4[1][0], g_T4[0][1] * g_T4[2][0], g_T4[0][1] * g_T4[3][0], g_T4[0][1] * g_T4[0][1], g_T4[0][1] * g_T4[1][1], g_T4[0][1] * g_T4[2][1], g_T4[0][1] * g_T4[3][1], g_T4[1][0] * g_T4[0][0], g_T4[1][0] * g_T4[1][0], g_T4[1][0] * g_T4[2][0], g_T4[1][0] * g_T4[3][0], g_T4[1][0] * g_T4[0][1], g_T4[1][0] * g_T4[1][1], g_T4[1][0] * g_T4[2][1], g_T4[1][0] * g_T4[3][1], g_T4[1][1] * g_T4[0][0], g_T4[1][1] * g_T4[1][0], g_T4[1][1] * g_T4[2][0], g_T4[1][1] * g_T4[3][0], g_T4[1][1] * g_T4[0][1], g_T4[1][1] * g_T4[1][1], g_T4[1][1] * g_T4[2][1], g_T4[1][1] * g_T4[3][1], g_T4[2][0] * g_T4[0][0], g_T4[2][0] * g_T4[1][0], g_T4[2][0] * g_T4[2][0], g_T4[2][0] * g_T4[3][0], g_T4[2][0] * g_T4[0][1], g_T4[2][0] * g_T4[1][1], g_T4[2][0] * g_T4[2][1], g_T4[2][0] * g_T4[3][1], g_T4[2][1] * g_T4[0][0], g_T4[2][1] * g_T4[1][0], g_T4[2][1] * g_T4[2][0], g_T4[2][1] * g_T4[3][0], g_T4[2][1] * g_T4[0][1], g_T4[2][1] * g_T4[1][1], g_T4[2][1] * g_T4[2][1], g_T4[2][1] * g_T4[3][1], g_T4[3][0] * g_T4[0][0], g_T4[3][0] * g_T4[1][0], g_T4[3][0] * g_T4[2][0], g_T4[3][0] * g_T4[3][0], g_T4[3][0] * g_T4[0][1], g_T4[3][0] * g_T4[1][1], g_T4[3][0] * g_T4[2][1], g_T4[3][0] * g_T4[3][1], g_T4[3][1] * g_T4[0][0], g_T4[3][1] * g_T4[1][0], g_T4[3][1] * g_T4[2][0], g_T4[3][1] * g_T4[3][0], g_T4[3][1] * g_T4[0][1], g_T4[3][1] * g_T4[1][1], g_T4[3][1] * g_T4[2][1], g_T4[3][1] * g_T4[3][1] */ ALIGN32(short dct4x4_const_table[64]) = { 32 * 32, 32 * 42, 32 * 32, 32 * 17, 32 * 32, 32 * 17, 32 * (-32), 32 * (-42), 32 * 32, 32 * 42, 32 * 32, 32 * 17, 32 * 32, 32 * 17, 32 * (-32), 32 * (-42), 42 * 32, 42 * 42, 42 * 32, 42 * 17, 42 * 32, 42 * 17, 42 * (-32), 42 * (-42), 17 * 32, 17 * 42, 17 * 32, 17 * 17, 17 * 32, 17 * 17, 17 * (-32), 17 * (-42), 32 * 32, 32 * 42, 32 * 32, 32 * 17, 32 * 32, 32 * 17, 32 * (-32), 32 * (-42), (-32) * 32, (-32) * 42, (-32) * 32, (-32) * 17, (-32) * 32, (-32) * 17, (-32) * (-32), (-32) * (-42), 17 * 32, 17 * 42, 17 * 32, 17 * 17, 17 * 32, 17 * 17, 17 * (-32), 17 * (-42), (-42) * 32, (-42) * 42, (-42) * 32, (-42) * 17, (-42) * 32, (-42) * 17, (-42) * (-32), (-42) * (-42) }; ALIGN32(short g_dct_temp_buf[1024]) = {0}; #endif //__ARM_ARCH_7A__ xavs2-1.3/source/common/common_arm.h000066400000000000000000000045111340660520300175100ustar00rootroot00000000000000/* * common_arm.h * * Description of this file: * common tables definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 
video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef COMMON_ARM_H_ #define COMMON_ARM_H_ #ifdef __ARM_ARCH_7A__ enum ARM_MC_PART { /*mc_copy idx*/ ARM_PART_2x4, ARM_PART_2x8, ARM_PART_4x2, ARM_PART_6x8, ARM_PART_8x2, ARM_PART_8x6 }; enum ARM_PLANE_COPY_PART{ /*plane_copy idx*/ ARM_PLANE_COPY_W88, ARM_PLANE_COPY_W160, ARM_PLANE_COPY_W176, ARM_PLANE_COPY_W320, ARM_PLANE_COPY_W352, ARM_PLANE_COPY_W360, ARM_PLANE_COPY_W512, ARM_PLANE_COPY_W640, ARM_PLANE_COPY_W704, ARM_PLANE_COPY_W720, ARM_PLANE_COPY_W960, ARM_PLANE_COPY_W1024, ARM_PLANE_COPY_W1280, ARM_PLANE_COPY_W1920 }; extern const unsigned char g_arm_partition_map_tab[]; //Wxh= 2x4, 2x8, 6x8, 8x6 #define ARM_MC_PART_INDEX(w, h) g_arm_partition_map_tab[(w + 1) * h] #define ARM_PLANE_COPY_INDEX(w) g_arm_partition_map_tab[(((w + 8) >> 4) - 5) >> 1] extern short dct4x4_const_table[64]; #endif /* __ARM_ARCH_7A__ */ #endif /* COMMON_ARM_H_ */ xavs2-1.3/source/common/cpu.c000066400000000000000000000341301340660520300161430ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * Steve Borho * Falei LUO * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #include "common.h" #include "cpu.h" #if SYS_MACOSX || SYS_FREEBSD #include #include #endif #if SYS_OPENBSD #include #include #include #endif #if ARCH_ARM #include #include static sigjmp_buf jmpbuf; static volatile sig_atomic_t canjump = 0; static void sigill_handler(int sig) { if (!canjump) { signal(sig, SIG_DFL); raise(sig); } canjump = 0; siglongjmp(jmpbuf, 1); } #endif // if ARCH_ARM /* --------------------------------------------------------------------------- */ typedef struct { const char name[16]; int flags; } xavs2_cpu_name_t; /* --------------------------------------------------------------------------- */ static const xavs2_cpu_name_t xavs2_cpu_names[] = { #if ARCH_X86 || ARCH_X86_64 #define MMX2 XAVS2_CPU_MMX | XAVS2_CPU_MMX2 | XAVS2_CPU_CMOV { "MMX2", MMX2 }, { "MMXEXT", MMX2 }, { "SSE", MMX2 | XAVS2_CPU_SSE }, #define SSE2 MMX2 | XAVS2_CPU_SSE | XAVS2_CPU_SSE2 { "SSE2Slow", SSE2 | XAVS2_CPU_SSE2_IS_SLOW }, { "SSE2", SSE2 }, { "SSE2Fast", SSE2 | XAVS2_CPU_SSE2_IS_FAST }, { "SSE3", SSE2 | XAVS2_CPU_SSE3 }, { "SSSE3", SSE2 | XAVS2_CPU_SSE3 | XAVS2_CPU_SSSE3 }, { "SSE4.1", SSE2 | XAVS2_CPU_SSE3 | XAVS2_CPU_SSSE3 | XAVS2_CPU_SSE4 }, { "SSE4", SSE2 | XAVS2_CPU_SSE3 | XAVS2_CPU_SSSE3 | XAVS2_CPU_SSE4 }, { "SSE4.2", SSE2 | XAVS2_CPU_SSE3 | XAVS2_CPU_SSSE3 | XAVS2_CPU_SSE4 | XAVS2_CPU_SSE42 }, #define AVX SSE2 | XAVS2_CPU_SSE3 | XAVS2_CPU_SSSE3 | XAVS2_CPU_SSE4 | XAVS2_CPU_SSE42 | XAVS2_CPU_AVX { "AVX", AVX }, { "XOP", AVX | XAVS2_CPU_XOP }, { "FMA4", AVX | XAVS2_CPU_FMA4 }, { "AVX2", AVX | XAVS2_CPU_AVX2 }, { "FMA3", AVX | XAVS2_CPU_FMA3 }, #undef AVX #undef SSE2 #undef MMX2 { "Cache32", XAVS2_CPU_CACHELINE_32 }, { "Cache64", XAVS2_CPU_CACHELINE_64 }, { "LZCNT", XAVS2_CPU_LZCNT }, { "BMI1", XAVS2_CPU_BMI1 }, { "BMI2", XAVS2_CPU_BMI1 | XAVS2_CPU_BMI2 }, { "SlowCTZ", XAVS2_CPU_SLOW_CTZ }, { "SlowAtom", XAVS2_CPU_SLOW_ATOM }, { "SlowPshufb", XAVS2_CPU_SLOW_PSHUFB }, { "SlowPalignr", XAVS2_CPU_SLOW_PALIGNR }, { "SlowShuffle", XAVS2_CPU_SLOW_SHUFFLE }, { "UnalignedStack", XAVS2_CPU_STACK_MOD4 }, #elif ARCH_ARM { "ARMv6", XAVS2_CPU_ARMV6 }, { "NEON", XAVS2_CPU_NEON }, { "FastNeonMRC", XAVS2_CPU_FAST_NEON_MRC }, #endif // if XAVS2_ARCH_X86 { "", 0 } }; /* --------------------------------------------------------------------------- */ char *xavs2_get_simd_capabilities(char *buf, int cpuid) { char *p = buf; for (int i = 0; xavs2_cpu_names[i].flags; i++) { if (!strcmp(xavs2_cpu_names[i].name, "SSE") && (cpuid & XAVS2_CPU_SSE2)) continue; if (!strcmp(xavs2_cpu_names[i].name, "SSE2") && (cpuid & (XAVS2_CPU_SSE2_IS_FAST | XAVS2_CPU_SSE2_IS_SLOW))) continue; if (!strcmp(xavs2_cpu_names[i].name, "SSE3") && (cpuid & XAVS2_CPU_SSSE3 || !(cpuid & XAVS2_CPU_CACHELINE_64))) continue; if (!strcmp(xavs2_cpu_names[i].name, "SSE4.1") && (cpuid & XAVS2_CPU_SSE42)) continue; if (!strcmp(xavs2_cpu_names[i].name, "BMI1") && (cpuid & XAVS2_CPU_BMI2)) continue; if ((cpuid & xavs2_cpu_names[i].flags) == xavs2_cpu_names[i].flags && (!i || xavs2_cpu_names[i].flags != xavs2_cpu_names[i - 1].flags)) p += sprintf(p, " %s", xavs2_cpu_names[i].name); } if (p == buf) sprintf(p, " none! 
(%08x)", cpuid); return buf; } #if HAVE_MMX /* --------------------------------------------------------------------------- */ uint32_t xavs2_cpu_detect(void) { uint32_t cpuid = 0; uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = { 0 }; uint32_t max_extended_cap, max_basic_cap; #if !ARCH_X86_64 if (!xavs2_cpu_cpuid_test()) { return 0; } #endif xavs2_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); max_basic_cap = eax; if (max_basic_cap == 0) { return 0; } xavs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); if (edx & 0x00800000) { cpuid |= XAVS2_CPU_MMX; } else { return cpuid; } if (edx & 0x02000000) { cpuid |= XAVS2_CPU_MMX2 | XAVS2_CPU_SSE; } if (edx & 0x00008000) { cpuid |= XAVS2_CPU_CMOV; } else { return cpuid; } if (edx & 0x04000000) { cpuid |= XAVS2_CPU_SSE2; } if (ecx & 0x00000001) { cpuid |= XAVS2_CPU_SSE3; } if (ecx & 0x00000200) { cpuid |= XAVS2_CPU_SSSE3; } if (ecx & 0x00080000) { cpuid |= XAVS2_CPU_SSE4; } if (ecx & 0x00100000) { cpuid |= XAVS2_CPU_SSE42; } /* Check OXSAVE and AVX bits */ if ((ecx & 0x18000000) == 0x18000000) { /* Check for OS support */ xavs2_cpu_xgetbv(0, &eax, &edx); if ((eax & 0x6) == 0x6) { cpuid |= XAVS2_CPU_AVX; if (ecx & 0x00001000) { cpuid |= XAVS2_CPU_FMA3; } } } if (max_basic_cap >= 7) { xavs2_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); /* AVX2 requires OS support, but BMI1/2 don't. */ if ((cpuid & XAVS2_CPU_AVX) && (ebx & 0x00000020)) { cpuid |= XAVS2_CPU_AVX2; } if (ebx & 0x00000008) { cpuid |= XAVS2_CPU_BMI1; if (ebx & 0x00000100) { cpuid |= XAVS2_CPU_BMI2; } } } if (cpuid & XAVS2_CPU_SSSE3) { cpuid |= XAVS2_CPU_SSE2_IS_FAST; } xavs2_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); max_extended_cap = eax; if (max_extended_cap >= 0x80000001) { xavs2_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if (ecx & 0x00000020) cpuid |= XAVS2_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ if (ecx & 0x00000040) { /* SSE4a, AMD only */ int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); cpuid |= XAVS2_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ if (family == 0x14) { cpuid &= ~XAVS2_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ cpuid |= XAVS2_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ cpuid |= XAVS2_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } if (family == 0x16) { cpuid |= XAVS2_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough * compared to alternate instruction sequences that this * is equal or faster on almost all such functions. */ } } if (cpuid & XAVS2_CPU_AVX) { if (ecx & 0x00000800) { /* XOP */ cpuid |= XAVS2_CPU_XOP; } if (ecx & 0x00010000) { /* FMA4 */ cpuid |= XAVS2_CPU_FMA4; } } if (!strcmp((char*)vendor, "AuthenticAMD")) { if (edx & 0x00400000) { cpuid |= XAVS2_CPU_MMX2; } if (!(cpuid & XAVS2_CPU_LZCNT)) { cpuid |= XAVS2_CPU_SLOW_CTZ; } if ((cpuid & XAVS2_CPU_SSE2) && !(cpuid & XAVS2_CPU_SSE2_IS_FAST)) { cpuid |= XAVS2_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } } } if (!strcmp((char*)vendor, "GenuineIntel")) { int family, model; xavs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); if (family == 6) { /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") * theoretically support sse2, but it's significantly slower than mmx for * almost all of x264's functions, so let's just pretend they don't. 
*/ if (model == 9 || model == 13 || model == 14) { cpuid &= ~(XAVS2_CPU_SSE2 | XAVS2_CPU_SSE3); //XAVS2_CHECK(!(cpuid & (XAVS2_CPU_SSSE3 | XAVS2_CPU_SSE4)), "unexpected CPU ID %d\n", cpuid); } else if (model == 28) { /* Detect Atom CPU */ cpuid |= XAVS2_CPU_SLOW_ATOM; cpuid |= XAVS2_CPU_SLOW_CTZ; cpuid |= XAVS2_CPU_SLOW_PSHUFB; } else if ((cpuid & XAVS2_CPU_SSSE3) && !(cpuid & XAVS2_CPU_SSE4) && model < 23) { /* Conroe has a slow shuffle unit. Check the model number to make sure not * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ cpuid |= XAVS2_CPU_SLOW_SHUFFLE; } } } if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpuid & XAVS2_CPU_SSE42)) { /* cacheline size is specified in 3 places, any of which may be missing */ int cache; xavs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); cache = (ebx & 0xff00) >> 5; // cflush size if (!cache && max_extended_cap >= 0x80000006) { xavs2_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cache = ecx & 0xff; // cacheline size } if (!cache && max_basic_cap >= 2) { // Cache and TLB Information static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; uint32_t buf[4]; int max, i = 0, j; do { xavs2_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); max = buf[0] & 0xff; buf[0] &= ~0xff; for (j = 0; j < 4; j++) { if (!(buf[j] >> 31)) { while (buf[j]) { if (strchr(cache32_ids, buf[j] & 0xff)) { cache = 32; } if (strchr(cache64_ids, buf[j] & 0xff)) { cache = 64; } buf[j] >>= 8; } } } } while (++i < max); } if (cache == 32) { cpuid |= XAVS2_CPU_CACHELINE_32; } else if (cache == 64) { cpuid |= XAVS2_CPU_CACHELINE_64; } else { xavs2_log(NULL, XAVS2_LOG_WARNING, "unable to determine cacheline size\n"); } } #ifdef BROKEN_STACK_ALIGNMENT cpuid |= XAVS2_CPU_STACK_MOD4; #endif return cpuid; } #endif // if HAVE_MMX #if SYS_LINUX && !(defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__)) /* --------------------------------------------------------------------------- */ int sched_getaffinity(pid_t pid, size_t cpusetsize, cpu_set_t *mask); #endif /* --------------------------------------------------------------------------- */ int xavs2_cpu_num_processors(void) { #if !HAVE_THREAD return 1; #elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) return 2; #elif SYS_WINDOWS return xavs2_thread_num_processors_np(); #elif SYS_LINUX unsigned int bit; int np = 0; cpu_set_t p_aff; memset(&p_aff, 0, sizeof(p_aff)); sched_getaffinity(0, sizeof(p_aff), &p_aff); for (bit = 0; bit < sizeof(p_aff); bit++) { np += (((uint8_t *)& p_aff)[bit / 8] >> (bit % 8)) & 1; } return np; #elif SYS_BEOS system_info info; get_system_info(&info); return info.cpu_count; #elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD int numberOfCPUs; size_t length = sizeof (numberOfCPUs); #if SYS_OPENBSD int mib[2] = { CTL_HW, HW_NCPU }; if(sysctl(mib, 2, &numberOfCPUs, &length, NULL, 0)) #else if(sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0)) #endif { numberOfCPUs = 1; } return numberOfCPUs; #else return 1; #endif } xavs2-1.3/source/common/cpu.h000066400000000000000000000126621340660520300161560ustar00rootroot00000000000000/* * cpu.h * * Description of this file: * CPU-Processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of 
AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_CPU_H #define XAVS2_CPU_H /** * =========================================================================== * const defines * =========================================================================== */ /* CPU flags */ /* x86 */ #define XAVS2_CPU_CMOV 0x0000001 #define XAVS2_CPU_MMX 0x0000002 #define XAVS2_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define XAVS2_CPU_MMXEXT XAVS2_CPU_MMX2 #define XAVS2_CPU_SSE 0x0000008 #define XAVS2_CPU_SSE2 0x0000010 #define XAVS2_CPU_SSE3 0x0000020 #define XAVS2_CPU_SSSE3 0x0000040 #define XAVS2_CPU_SSE4 0x0000080 /* SSE4.1 */ #define XAVS2_CPU_SSE42 0x0000100 /* SSE4.2 */ #define XAVS2_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */ #define XAVS2_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */ #define XAVS2_CPU_XOP 0x0000800 /* AMD XOP */ #define XAVS2_CPU_FMA4 0x0001000 /* AMD FMA4 */ #define XAVS2_CPU_AVX2 0x0002000 /* AVX2 */ #define XAVS2_CPU_FMA3 0x0004000 /* Intel FMA3 */ #define XAVS2_CPU_BMI1 0x0008000 /* BMI1 */ #define XAVS2_CPU_BMI2 0x0010000 /* BMI2 */ /* x86 modifiers */ #define XAVS2_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ #define XAVS2_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ #define XAVS2_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ #define XAVS2_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ #define XAVS2_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ #define XAVS2_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ #define XAVS2_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ #define XAVS2_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow * SIMD multiplies, slow SIMD variable shifts, slow pshufb, * cacheline split penalties -- gather everything here that * isn't shared by other CPUs to avoid making half a dozen * new SLOW flags. 
*/ #define XAVS2_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ #define XAVS2_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ /* ARM */ #define XAVS2_CPU_ARMV6 0x0000001 #define XAVS2_CPU_NEON 0x0000002 /* ARM NEON */ #define XAVS2_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ /** * =========================================================================== * declarations * =========================================================================== */ #define xavs2_cpu_detect FPFX(cpu_detect) uint32_t xavs2_cpu_detect(void); #define xavs2_cpu_num_processors FPFX(cpu_num_processors) int xavs2_cpu_num_processors(void); #define xavs2_cpu_emms FPFX(cpu_emms) void xavs2_cpu_emms(void); #define xavs2_cpu_sfence FPFX(cpu_sfence) void xavs2_cpu_sfence(void); #define xavs2_get_simd_capabilities FPFX(get_simd_capabilities) char *xavs2_get_simd_capabilities(char *buf, int cpuid); #if HAVE_MMX #define xavs2_cpu_cpuid_test FPFX(cpu_cpuid_test) int xavs2_cpu_cpuid_test(void); #define xavs2_cpu_cpuid FPFX(cpu_cpuid) uint32_t xavs2_cpu_cpuid(uint32_t op, uint32_t * eax, uint32_t * ebx, uint32_t * ecx, uint32_t * edx); #define xavs2_cpu_xgetbv FPFX(cpu_xgetbv) void xavs2_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx); #define xavs2_emms() xavs2_cpu_emms() #else #define xavs2_emms() #endif #endif // XAVS2_CPU_H xavs2-1.3/source/common/cudata.c000066400000000000000000000441351340660520300166230ustar00rootroot00000000000000/* * cudata.c * * Description of this file: * CU-Data functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "cudata.h" #include "header.h" #include "block_info.h" #include "transform.h" #include "me.h" #include "rdo.h" #include "predict.h" #include "bitstream.h" #include "ratecontrol.h" /** * =========================================================================== * local/global variables * =========================================================================== */ #if XAVS2_TRACE extern int g_sym_count; /* global symbol count for trace */ extern int g_bit_count; /* global bit count for trace */ #endif /* --------------------------------------------------------------------------- */ static const uint8_t BLOCK_STEPS[MAX_PRED_MODES][2] = { // [mode][h/v] { 2, 2 }, // 8x8 (PRED_SKIP ) { 2, 2 }, // 8x8 (PRED_2Nx2N ) { 2, 1 }, // 8x4 (PRED_2NxN ) { 1, 2 }, // 4x8 (PRED_Nx2N ) { 2, 1 }, // 8x2 (PRED_2NxnU ) { 2, 1 }, // 8x6 (PRED_2NxnD ) { 1, 2 }, // 2x8 (PRED_nLx2N ) { 1, 2 }, // 6x8 (PRED_nRx2N ) { 2, 2 }, // 8x8 (PRED_I_2Nx2N) { 1, 1 }, // 4x4 (PRED_I_NxN ) { 2, 1 }, // 8x2 (PRED_I_2Nxn ) { 1, 2 } // 2x8 (PRED_I_nx2N ) }; /* --------------------------------------------------------------------------- */ const uint8_t tab_split_tu_pos[MAX_PRED_MODES][4][2] = { // [mode][block][x/y] // x0,y0 x1,y1 x2,y2 x3,y3 CU TU0 TU1 TU2 TU3 { { 0, 0 }, { 4, 0 }, { 0, 4 }, { 4, 4 } }, // 8x8: 4x4, 4x4, 4x4, 4x4 (PRED_SKIP ) { { 0, 0 }, { 4, 0 }, { 0, 4 }, { 4, 4 } }, // 8x8: 4x4, 4x4, 4x4, 4x4 (PRED_2Nx2N ) { { 0, 0 }, { 0, 2 }, { 0, 4 }, { 0, 6 } }, // 8x4: 8x2, 8x2, 8x2, 8x2 (PRED_2NxN ) { { 0, 0 }, { 2, 0 }, { 4, 0 }, { 6, 0 } }, // 4x8: 2x8, 2x8, 2x8, 2x8 (PRED_Nx2N ) { { 0, 0 }, { 0, 2 }, { 0, 4 }, { 0, 6 } }, // 8x2: 8x2, 8x2, 8x2, 8x2 (PRED_2NxnU ) { { 0, 0 }, { 0, 2 }, { 0, 4 }, { 0, 6 } }, // 8x6: 8x2, 8x2, 8x2, 8x2 (PRED_2NxnD ) { { 0, 0 }, { 2, 0 }, { 4, 0 }, { 6, 0 } }, // 2x8: 2x8, 2x8, 2x8, 2x8 (PRED_nLx2N ) { { 0, 0 }, { 2, 0 }, { 4, 0 }, { 6, 0 } }, // 6x8: 2x8, 2x8, 2x8, 2x8 (PRED_nRx2N ) { { 0, 0 }, { 4, 0 }, { 0, 4 }, { 4, 4 } }, // 8x8: 4x4, 4x4, 4x4, 4x4 (PRED_I_2Nx2N) { { 0, 0 }, { 4, 0 }, { 0, 4 }, { 4, 4 } }, // 4x4: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) { { 0, 0 }, { 0, 2 }, { 0, 4 }, { 0, 6 } }, // 8x2: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) { { 0, 0 }, { 2, 0 }, { 4, 0 }, { 6, 0 } } // 2x8: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) }; /* --------------------------------------------------------------------------- */ const uint8_t tab_qp_scale_chroma[64] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, }; /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void cu_mvd_derivation(xavs2_t *h, mv_t *mvd, const mv_t *mv, const mv_t *mvp) { if (h->param->enable_pmvr) { mv_t ctr; ctr.x = (mvp->x >> 1) << 1; ctr.y = (mvp->y >> 1) << 1; if (XAVS2_ABS(mv->x - ctr.x) > TH_PMVR) { mvd->x = (int16_t)((mv->x + ctr.x + xavs2_sign2(mv->x - ctr.x) * TH_PMVR) >> 1) - mvp->x; mvd->y = (mv->y - ctr.y) >> 1; } else if (XAVS2_ABS(mv->y - ctr.y) > TH_PMVR) { mvd->x = (mv->x - ctr.x) >> 1; mvd->y = (int16_t)((mv->y + ctr.y + xavs2_sign2(mv->y - ctr.y) * TH_PMVR) >> 1) - mvp->y; } else { mvd->x = mv->x - mvp->x; mvd->y = mv->y - mvp->y; } } else { mvd->x = mv->x - 
mvp->x; mvd->y = mv->y - mvp->y; } } /* --------------------------------------------------------------------------- * get mvds, only for inter cu mode */ void cu_get_mvds(xavs2_t *h, cu_t *p_cu) { int mode = p_cu->cu_info.i_mode; int pdir; int k, blk_idx; cu_mode_t *p_mode = cu_get_layer_mode(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mvmode = p_mode->mvs[mode]; assert(IS_INTER_MODE(mode) && !IS_SKIP_MODE(mode)); for (k = 0; k < p_cu->cu_info.num_pu; k++) { mv_t mv_fwd, mv_bwd; mv_t mvp_fwd, mvp_bwd; mv_t mvd_fwd, mvd_bwd; #if XAVS2_TRACE mv_fwd.v = mv_bwd.v = 0; mvp_fwd.v = mvp_bwd.v = 0; #endif mvd_fwd.v = mvd_bwd.v = 0; blk_idx = pu_get_mv_index(mode, k); pdir = p_cu->cu_info.b8pdir[k]; /* forward motion vectors */ if (pdir != PDIR_BWD) { int ref_fwd = p_cu->cu_info.ref_idx_1st[k]; mv_fwd = p_cu->mc.mv[k][0]; mvp_fwd = p_mvmode[blk_idx].all_mvp[ref_fwd]; cu_mvd_derivation(h, &mvd_fwd, &mv_fwd, &mvp_fwd); } /* backward motion vectors */ if (pdir == PDIR_BWD || pdir == PDIR_BID) { // has backward vector mv_bwd = p_cu->mc.mv[k][1]; mvp_bwd = p_mvmode[blk_idx].all_mvp[B_BWD]; cu_mvd_derivation(h, &mvd_bwd, &mv_bwd, &mvp_bwd); } // store (oversampled) mvd p_cu->cu_info.mvd[0][k] = mvd_fwd; p_cu->cu_info.mvd[1][k] = mvd_bwd; #if XAVS2_TRACE p_cu->cu_info.mv [0][k] = mv_fwd; p_cu->cu_info.mvp[0][k] = mvp_fwd; p_cu->cu_info.mv [1][k] = mv_bwd; p_cu->cu_info.mvp[1][k] = mvp_bwd; #endif } } /* --------------------------------------------------------------------------- * copy one block (multi-planes) */ static void block_copy_x3(pel_t *p_dst[], int i_dst[], pel_t *p_src[], int i_src[], int i_width[], int i_height[], int i_planes) { pel_t *dst, *src; int y, k; for (k = 0; k < i_planes; k++) { int i_size = i_width[k] * sizeof(pel_t); memcpy_t f_memcpy = i_size & 15 ? 
memcpy : g_funcs.memcpy_aligned; dst = p_dst[k]; src = p_src[k]; for (y = i_height[k]; y != 0; y--) { f_memcpy(dst, src, i_size); dst += i_dst[k]; src += i_src[k]; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void xavs2_copy_col1(pel_t *dst, pel_t *src, const int height, const int stride) { int i; int k = 0; for (i = height; i != 0; i--) { dst[k] = src[k]; k += stride; } } /* --------------------------------------------------------------------------- * cache CTU border */ static INLINE void xavs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top, const pel_t *p_left, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst[-i] = p_left[0]; p_left += i_left; } } /* --------------------------------------------------------------------------- * cache CTU border (UV components together) */ static INLINE void xavs2_cache_lcu_border_uv(pel_t *p_dst_u, const pel_t *p_top_u, const pel_t *p_left_u, pel_t *p_dst_v, const pel_t *p_top_v, const pel_t *p_left_v, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel_t)); memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst_u[-i] = p_left_u[0]; p_dst_v[-i] = p_left_v[0]; p_left_u += i_left; p_left_v += i_left; } } /* --------------------------------------------------------------------------- * start encoding a lcu (initializing) */ void lcu_start_init_pos(xavs2_t *h, int i_lcu_x, int i_lcu_y) { const int scu_x = i_lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); const int scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); const int pix_x = scu_x << MIN_CU_SIZE_IN_BIT; const int pix_y = scu_y << MIN_CU_SIZE_IN_BIT; int lcu_width = 1 << h->i_lcu_level; int lcu_height = 1 << h->i_lcu_level; cu_t *p_cu = h->lcu.p_ctu; /* point to the CTU */ #if ENABLE_RATE_CONTROL_CU int w_in_scu; /* width in SCU of current lcu */ int h_in_scu; /* height in SCU of current lcu */ int x, y; #endif /* ------------------------------------------------------------- * 1, update the coordinates for the next lcu */ /* update the coordinates */ h->lcu.i_lcu_xy = i_lcu_y * h->i_width_in_lcu + i_lcu_x; h->lcu.i_scu_xy = p_cu->i_scu_xy = scu_y * h->i_width_in_mincu + scu_x; h->lcu.i_scu_x = p_cu->cu_info.i_scu_x = scu_x; h->lcu.i_scu_y = p_cu->cu_info.i_scu_y = scu_y; h->lcu.i_pix_x = p_cu->i_pix_x = pix_x; h->lcu.i_pix_y = p_cu->i_pix_y = pix_y; /* update actual width and height */ lcu_width = XAVS2_MIN( lcu_width, h->i_width - pix_x); lcu_height = XAVS2_MIN(lcu_height, h->i_height - pix_y); h->lcu.i_pix_width = (int16_t)lcu_width; h->lcu.i_pix_height = (int16_t)lcu_height; /* ------------------------------------------------------------- * 2, init qp for current CTU */ #if ENABLE_RATE_CONTROL_CU if (h->param->i_rc_method == XAVS2_RC_CBR_SCU) { h->i_qp = xavs2_rc_get_lcu_qp(h, h->fenc->i_frame, h->i_qp); } #endif /* ------------------------------------------------------------- * 3, init all SCU in current CTU */ h->lcu_slice_idx[h->lcu.i_lcu_xy] = (int8_t)(h->i_slice_index); #if ENABLE_RATE_CONTROL_CU w_in_scu = lcu_width >> MIN_CU_SIZE_IN_BIT; h_in_scu = lcu_height >> MIN_CU_SIZE_IN_BIT; for (y = 0; y < h_in_scu; y++) { cu_info_t *p_cu_info = &h->cu_info[h->lcu.i_scu_xy + y * h->i_width_in_mincu]; /* point to a SCU */ for (x = w_in_scu; x != 0; x--, 
p_cu_info++) { p_cu_info->i_delta_qp = 0; p_cu_info->i_cu_qp = (int8_t)(h->i_qp); // needed in loop filter (even if constant QP is used) // reset syntax element entries in cu_info_t // ЩԪڱÿLCUʱãԴ˴Ҫ޸ // p_cu_info->i_mode = PRED_SKIP; // p_cu_info->i_cbp = 0; // p_cu_info->i_level = MIN_CU_SIZE_IN_BIT; } } #endif } /* --------------------------------------------------------------------------- * start encoding a lcu (initializing) */ void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y) { const int scu_x = i_lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); const int scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int img_x = scu_x << MIN_CU_SIZE_IN_BIT; int img_y = scu_y << MIN_CU_SIZE_IN_BIT; int lcu_width = h->lcu.i_pix_width; int lcu_height = h->lcu.i_pix_height; int blk_w[3]; int blk_h[3]; int i_src[3]; int i_dst[3]; pel_t *p_src[3]; pel_t *p_dst[3]; /* ------------------------------------------------------------- * 1, copy LCU pixel data from original image buffer */ i_src[0] = h->fenc->i_stride[0]; i_src[1] = h->fenc->i_stride[1]; i_src[2] = h->fenc->i_stride[2]; p_src[0] = h->fenc->planes[0] + (img_y ) * i_src[0] + (img_x ); p_src[1] = h->fenc->planes[1] + (img_y >> 1) * i_src[1] + (img_x >> 1); p_src[2] = h->fenc->planes[2] + (img_y >> 1) * i_src[2] + (img_x >> 1); i_dst[0] = i_dst[1] = i_dst[2] = FENC_STRIDE; p_dst[0] = h->lcu.p_fenc[0]; p_dst[1] = h->lcu.p_fenc[1]; p_dst[2] = h->lcu.p_fenc[2]; blk_w[0] = lcu_width; blk_h[0] = lcu_height; blk_w[1] = blk_w[2] = lcu_width >> 1; blk_h[1] = blk_h[2] = lcu_height >> 1; block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); /* first CTU of LCU row */ if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { if (img_x == 0) { memcpy(h->lcu.ctu_border[0].rec_top + 1, h->intra_border[0], lcu_width * 2 * sizeof(pel_t)); memcpy(h->lcu.ctu_border[1].rec_top + 1, h->intra_border[1], lcu_width * sizeof(pel_t)); memcpy(h->lcu.ctu_border[2].rec_top + 1, h->intra_border[2], lcu_width * sizeof(pel_t)); } else if (h->param->i_lcurow_threads > 1) { /* top-right pixels */ memcpy(h->lcu.ctu_border[0].rec_top + 1 + lcu_width, h->intra_border[0] + img_x + lcu_width, lcu_width * sizeof(pel_t)); memcpy(h->lcu.ctu_border[1].rec_top + 1 + (lcu_width >> 1), h->intra_border[1] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t)); memcpy(h->lcu.ctu_border[2].rec_top + 1 + (lcu_width >> 1), h->intra_border[2] + ((img_x + lcu_width) >> 1), (lcu_width >> 1) * sizeof(pel_t)); } } } /* --------------------------------------------------------------------------- * terminate processing of the current LCU depending on the chosen slice mode */ void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y) { const int img_y = h->lcu.i_pix_y; const int img_y_c = img_y >> 1; const int img_x = h->lcu.i_pix_x; const int img_x_c = img_x >> 1; const int lcu_width = h->lcu.i_pix_width; /* width of lcu (in pixel) */ const int lcu_height = h->lcu.i_pix_height; /* height of lcu (in pixel) */ const int lcu_width_c = lcu_width >> 1; const int lcu_height_c = lcu_height >> 1; int blk_w[3]; int blk_h[3]; int i_src[3]; int i_dst[3]; pel_t *p_src[3]; pel_t *p_dst[3]; /* ------------------------------------------------------------- * 1, copy decoded LCU to frame buffer */ i_dst[0] = h->fdec->i_stride[0]; i_dst[1] = h->fdec->i_stride[1]; i_dst[2] = h->fdec->i_stride[2]; p_dst[0] = h->fdec->planes[0] + (img_y) * i_dst[0] + (img_x); p_dst[1] = h->fdec->planes[1] + (img_y_c) * i_dst[1] + (img_x_c); p_dst[2] = h->fdec->planes[2] + (img_y_c) * i_dst[2] + 
(img_x_c); i_src[0] = i_src[1] = i_src[2] = FDEC_STRIDE; p_src[0] = h->lcu.p_fdec[0]; p_src[1] = h->lcu.p_fdec[1]; p_src[2] = h->lcu.p_fdec[2]; blk_w[0] = lcu_width; blk_h[0] = lcu_height; blk_w[1] = blk_w[2] = lcu_width_c; blk_h[1] = blk_h[2] = lcu_height_c; block_copy_x3(p_dst, i_dst, p_src, i_src, blk_w, blk_h, 3); /* ------------------------------------------------------------- * 2, backup right col and bottom row pixels for intra coding */ if (h->fenc->b_enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { // backup intra pred mode of bottom 4x4 row int i_pred_mode_stride = h->i_width_in_minpu + 16; int i_pred_mode_width_in_lcu = (1 << h->i_lcu_level) >> MIN_PU_SIZE_IN_BIT; memcpy(h->ipredmode - i_pred_mode_stride + i_lcu_x * i_pred_mode_width_in_lcu, h->ipredmode + i_pred_mode_stride * (i_pred_mode_width_in_lcu - 1) + i_lcu_x * i_pred_mode_width_in_lcu, i_pred_mode_width_in_lcu * sizeof(int8_t)); /* cache top and left samples for intra prediction of next CTU */ xavs2_cache_lcu_border(h->lcu.ctu_border[0].rec_top, h->intra_border[0] + img_x + lcu_width - 1, p_src[0] + lcu_width - 1, FDEC_STRIDE, lcu_width, lcu_height); xavs2_cache_lcu_border_uv(h->lcu.ctu_border[1].rec_top, h->intra_border[1] + img_x_c + lcu_width_c - 1, p_src[1] + lcu_width_c - 1, h->lcu.ctu_border[2].rec_top, h->intra_border[2] + img_x_c + lcu_width_c - 1, p_src[2] + lcu_width_c - 1, FDEC_STRIDE, lcu_width_c, lcu_height_c); /* 2.2, backup bottom row pixels */ if (i_lcu_y < h->i_height_in_lcu - 1) { g_funcs.fast_memcpy(h->intra_border[0] + img_x, p_src[0] + (lcu_height - 1) * FDEC_STRIDE, lcu_width * sizeof(pel_t)); g_funcs.fast_memcpy(h->intra_border[1] + img_x_c, p_src[1] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t)); g_funcs.fast_memcpy(h->intra_border[2] + img_x_c, p_src[2] + (lcu_height_c - 1) * FDEC_STRIDE, lcu_width_c * sizeof(pel_t)); } } } xavs2-1.3/source/common/cudata.h000066400000000000000000000124031340660520300166210ustar00rootroot00000000000000/* * cudata.h * * Description of this file: * CU-Data functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_CUDATA_H #define XAVS2_CUDATA_H void cu_get_mvds(xavs2_t *h, cu_t *p_cu); void lcu_start_init_pos (xavs2_t *h, int i_lcu_x, int i_lcu_y); void lcu_start_init_pixels(xavs2_t *h, int i_lcu_x, int i_lcu_y); void lcu_end(xavs2_t *h, int i_lcu_x, int i_lcu_y); /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int clip_qp(xavs2_t *h, int i_qp) { /* AVS2-P2 ͼ picture_qp */ int max_qp = MAX_QP + (h->param->sample_bit_depth - 8) * 8; return XAVS2_MAX(MIN_QP, XAVS2_MIN(max_qp, i_qp)); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void cu_init_transform_block(int i_cu_level, int i_tu_split, int b8, cb_t *p_tb) { static const cb_t TRANS_BLOCK_INFO[TU_SPLIT_TYPE_NUM][4] = {// [tu_split_type][block] // x, y, w, h x, y, w, h x, y, w, h x, y, w, h for block 0, 1, 2 and 3 { { { 0, 0, 8, 8 } }, { { 0, 0, 0, 0 } }, { { 0, 0, 0, 0 } }, { { 0, 0, 0, 0 } } }, // 0: 8x8, ---, ---, --- (TU_SPLIT_NON ) { { { 0, 0, 8, 2 } }, { { 0, 2, 8, 2 } }, { { 0, 4, 8, 2 } }, { { 0, 6, 8, 2 } } }, // 2: 8x2, 8x2, 8x2, 8x2 (TU_SPLIT_HOR ) { { { 0, 0, 2, 8 } }, { { 2, 0, 2, 8 } }, { { 4, 0, 2, 8 } }, { { 6, 0, 2, 8 } } }, // 3: 2x8, 2x8, 2x8, 2x8 (TU_SPLIT_VER ) { { { 0, 0, 4, 4 } }, { { 4, 0, 4, 4 } }, { { 0, 4, 4, 4 } }, { { 4, 4, 4, 4 } } } // 1: 4x4, 4x4, 4x4, 4x4 (TU_SPLIT_CROSS ) }; static const cb_t CHROMAW_BLOCK_INFO[2] = { { { 0, 8, 4, 4 } }, { { 4, 8, 4, 4 } } }; const int shift_bits = (i_cu_level - MIN_CU_SIZE_IN_BIT); if (b8 < 4) { p_tb->v = TRANS_BLOCK_INFO[i_tu_split][b8].v << shift_bits; } else { p_tb->v = CHROMAW_BLOCK_INFO[b8 - 4].v << shift_bits; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int pu_get_mv_index(int i_mode, int pu_idx) { int i_shift = IS_HOR_PU_PART(i_mode); return pu_idx << i_shift; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int cu_get_qp(xavs2_t *h, cu_info_t *p_cu_info) { #if ENABLE_RATE_CONTROL_CU UNUSED_PARAMETER(h); return p_cu_info->i_cu_qp; #else UNUSED_PARAMETER(p_cu_info); return h->i_qp; #endif } /* --------------------------------------------------------------------------- * */ static ALWAYS_INLINE int cu_get_slice_index(xavs2_t *h, int scu_x, int scu_y) { int lcu_shift = (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int lcu_xy = (scu_y >> lcu_shift) * h->i_width_in_lcu + (scu_x >> lcu_shift); return h->lcu_slice_idx[lcu_xy]; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int cu_get_chroma_qp(xavs2_t *h, int luma_qp, int uv) { int QP; UNUSED_PARAMETER(uv); UNUSED_PARAMETER(h); QP = tab_qp_scale_chroma[XAVS2_CLIP3(0, 63, luma_qp)]; return QP; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE cu_layer_t *cu_get_layer(xavs2_t *h, int i_cu_level) { return &h->lcu.cu_layer[i_cu_level - MIN_CU_SIZE_IN_BIT]; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE cu_mode_t *cu_get_layer_mode(xavs2_t *h, int i_cu_level) { return &h->lcu.cu_layer[i_cu_level - MIN_CU_SIZE_IN_BIT].cu_mode; } static ALWAYS_INLINE cu_parallel_t *cu_get_enc_context(xavs2_t *h, int i_cu_level) { #if PARALLEL_INSIDE_CTU return &h->lcu.cu_enc[i_cu_level - MIN_CU_SIZE_IN_BIT]; #else UNUSED_PARAMETER(i_cu_level); return &h->lcu.cu_enc[0]; #endif } #endif // XAVS2_CUDATA_H 
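/* ---------------------------------------------------------------------------
 * Illustrative sketch (added for clarity, not part of the original xavs2
 * sources): how the QP helpers above are typically combined. A frame-level
 * QP is first clipped to the range allowed by the configured sample bit
 * depth (as in clip_qp()) and then mapped to a chroma QP through
 * tab_qp_scale_chroma (as in cu_get_chroma_qp()). The function name and its
 * two parameters are hypothetical; MIN_QP/MAX_QP, XAVS2_MIN/XAVS2_MAX,
 * XAVS2_CLIP3 and the table are the ones defined in defines.h and cudata.c.
 * Guarded by "#if 0" so it is never compiled.
 */
#if 0
static int example_luma_to_chroma_qp(int frame_qp, int sample_bit_depth)
{
    /* each extra bit of sample depth extends the upper QP bound by 8 */
    int max_qp  = MAX_QP + (sample_bit_depth - 8) * 8;
    int luma_qp = XAVS2_MAX(MIN_QP, XAVS2_MIN(max_qp, frame_qp));

    /* chroma QP comes from the flattening table; above QP 42 the chroma QP
     * increases more slowly than the luma QP */
    return tab_qp_scale_chroma[XAVS2_CLIP3(0, 63, luma_qp)];
}
#endif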
xavs2-1.3/source/common/defines.h000066400000000000000000000406051340660520300170020ustar00rootroot00000000000000/* * defines.h * * Description of this file: * const variable definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_DEFINES_H #define XAVS2_DEFINES_H /** * =========================================================================== * build switch * =========================================================================== */ /* --------------------------------------------------------------------------- * debug */ #define XAVS2_DUMP_REC 1 /* dump reconstruction frames, 1: ON, 0: OFF */ #define XAVS2_TRACE 0 /* write trace file, 1: ON, 0: OFF */ #define XAVS2_STAT 1 /* stat encoder info, 1: On, 0: OFF */ /** * =========================================================================== * optimization * =========================================================================== */ /* 㷨Ƿ */ #define IS_ALG_ENABLE(alg) ((h->i_fast_algs >> alg) & 1) /* --------------------------------------------------------------------------- * mask for fast algorithms */ enum xavs2_fast_algorithms_e { /* fast inter */ OPT_EARLY_SKIP , /* ʱԵĿSKIP */ OPT_PSC_MD , /* ʱԵĿģʽ (prediction size correlation based mode decision) */ OPT_FAST_CBF_MODE , /* ŻģʽCBFʣĻģʽ */ OPT_FAST_PU_SEL , /* OPT_FAST_CBF_MODEļ㷨cbf=0ʱ2Nx2NSKIPʣ֡ģʽ֡ģʽ */ OPT_BYPASS_AMP , /* PRED_2NxNδţֱַͬPRED_2NxnU/PRED_2NxnD; PRED_Nx2Nͬ */ OPT_DMH_CANDIDATE , /* ھDMHģʽµRDO */ OPT_BYPASS_MODE_FPIC , /* F֡е֡ģʽDMHģʽ */ OPT_ADVANCE_CHROMA_AEC , /* ǰɫȿı任ϵ */ OPT_ROUGH_MODE_SKIP , /* */ OPT_CMS_ETMD , /* ֡ڻַʽ * 1I_2Nx2N֡Ԥģʽ򲻱֣֡ * 2֡ģʽCBPΪʱ֡ڻַʽ*/ OPT_ROUGH_PU_SEL , /* ԵPUģʽ */ OPT_CBP_DIRECT , /* directģʽ²вǷΪȫ飬PUֺCUݹ黮 */ OPT_SKIP_DMH_THRES , /* ͨDistortionֵDMHģʽı */ OPT_ROUGH_SKIP_SEL , /* ͨdistortionԱֻԸskip/directģʽRDO */ /* fast intra */ OPT_BYPASS_SDIP , /* PRED_I_2NxnѻţֱPRED_I_nx2N */ OPT_FAST_INTRA_MODE , /* ֡ģʽپ */ OPT_FAST_RDO_INTRA_C , /* ֡ChromaԤģʽŻɫȷ */ OPT_ET_RDO_INTRA_L , /* Luma RDOǰ˳ */ OPT_ET_INTRA_DEPTH , /* MADֵI֡depthǰֹ */ OPT_BYPASS_INTRA_BPIC , /* B֡֡ԤģʽCBPΪ㣬֡Ԥģʽ */ OPT_FAST_INTRA_IN_INTER , /* CUģʽǷ֡ڼǰCU֡ģʽRDCost֡֡ģʽ */ /* fast CU depth */ OPT_ECU , /* HMȫSKIPģʽֹ²㻮 */ OPT_ET_HOMO_MV , /* */ OPT_CU_CSET , /* CSET of uAVS2, Only for inter frames that are not referenced by others */ OPT_CU_DEPTH_CTRL , /* ʱԵDepthƣϡϡϺʱοlevelDEPTHΧȫI֡Ҳ */ OPT_CU_QSFD , /* CU splitting 
termination based on RD-Cost: Z. Wang, R. Wang, K. Fan, H. Sun, and W. Gao, uAVS2Fast encoder for the 2nd generation IEEE 1857 video coding standard, Signal Process. Image Commun., vol. 53, no. October 2016, pp. 13C23, 2017. */ /* fast transform and Quant */ OPT_BYPASS_INTRA_RDOQ , /* B֡֡е֡ģʽRDOQ */ OPT_RDOQ_AZPC , /* ͨԱ任ϵֵжϼȫRDOQԤɫȷRDOQ*/ /* others */ OPT_FAST_ZBLOCK , /* */ OPT_TR_KEY_FRAME_MD , /* Ըǹؼ֡IJģʽܽʡ5%ʱ */ OPT_CODE_OPTIMZATION , /* OPT_CU_SUBCU_COST: ȱCUٱСCUʱǰСCURDCostCUһCU * OPT_RDOQ_SKIP: ͨRDOQ֮ǰԱ任ϵֵжϼȫ飬RDOQ */ OPT_BIT_EST_PSZT , /* TUعƣ33x32TUٶֻеƵ16x16зϵ */ OPT_TU_LEVEL_DEC , /* TU㻮־ߣԵһTUѡţڶTU֣ǷҪTU */ OPT_FAST_ALF , /* ALF㷨ڶB֡֡οALFALFЭʱstep=2² */ OPT_FAST_SAO , /* SAO㷨ڶB֡֡οSAO */ OPT_SUBCU_SPLIT , /* ݻӿĿ߸ǷԷSKIPģʽRDO */ OPT_PU_RMS , /* رС飨8x8,16x16)ֵԤⵥԪ2Nx2N֡ڣ֡ԼSKIPģʽ*/ NUM_FAST_ALGS /* ܵĿ㷨 */ }; /* --------------------------------------------------------------------------- * const defines related with fast algorithms */ #define SAVE_CU_INFO 1 /* ο֡ÿһ֡cu typecu bitsizeڻȡʱcuģʽcuߴ */ #define NUM_INTRA_C_FULL_RD 4 /* --------------------------------------------------------------------------- * switches for modules to be removed */ /* remove code for Weighted Quant */ #define ENABLE_WQUANT 0 /* 1: enable, 0: disable */ /* frame level interpolation */ #define ENABLE_FRAME_SUBPEL_INTPL 1 /* Entropy coding optimization for context update */ #define CTRL_OPT_AEC 1 /* --------------------------------------------------------------------------- * Rate Control */ #define ENABLE_RATE_CONTROL_CU 0 /* Enable Rate-Control on CU level: 1: enable, 0: disable */ #define ENABLE_AUTO_INIT_QP 1 /* ĿԶóʼQPֵ */ /** * =========================================================================== * const defines * =========================================================================== */ /* --------------------------------------------------------------------------- * const for bool type */ #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif /* --------------------------------------------------------------------------- * profiles */ #define MAIN_PICTURE_PROFILE 0x12 /* profile: MAIN_PICTURE */ #define MAIN_PROFILE 0x20 /* profile: MAIN */ #define MAIN10_PROFILE 0x22 /* profile: MAIN10 */ /* --------------------------------------------------------------------------- * chroma formats */ #define CHROMA_400 0 #define CHROMA_420 1 #define CHROMA_422 2 #define CHROMA_444 3 #define CHROMA_V_SHIFT (h->i_chroma_v_shift) /* --------------------------------------------------------------------------- * quantization parameter range */ #define MIN_QP 0 /* min QP */ #define MAX_QP 63 /* max QP */ #define SHIFT_QP 11 /* shift QP */ /* --------------------------------------------------------------------------- * cu size */ #define MAX_CU_SIZE 64 /* max CU size */ #define MAX_CU_SIZE_IN_BIT 6 #define MIN_CU_SIZE 8 /* min CU size */ #define MIN_CU_SIZE_IN_BIT 3 #define MIN_PU_SIZE 4 /* min PU size */ #define MIN_PU_SIZE_IN_BIT 2 #define BLOCK_MULTIPLE (MIN_CU_SIZE / MIN_PU_SIZE) #define CTU_DEPTH (MAX_CU_SIZE_IN_BIT - MIN_CU_SIZE_IN_BIT + 1) #define B4X4_IN_BIT 2 /* unit level: 2 */ #define B8X8_IN_BIT 3 /* unit level: 3 */ #define B16X16_IN_BIT 4 /* unit level: 4 */ #define B32X32_IN_BIT 5 /* unit level: 5 */ #define B64X64_IN_BIT 6 /* unit level: 6 */ /* --------------------------------------------------------------------------- * parameters for scale mv */ #define MULTIx2 32768 #define MULTI 16384 #define HALF_MULTI 8192 #define OFFSET 14 /* 
--------------------------------------------------------------------------- * prediction techniques */ #define LAM_2Level_TU 0.8 #define DMH_MODE_NUM 5 /* number of DMH mode */ #define WPM_NUM 3 /* number of WPM */ #define TH_PMVR 2 /* PMVRķ֮һؾMVĿ÷Χ */ /* --------------------------------------------------------------------------- * coefficient coding */ #define MAX_TU_SIZE 32 /* 任Сرʱϵ */ #define MAX_TU_SIZE_IN_BIT 5 /* 任Сرʱϵ */ #define SIZE_CG 4 /* CG С 4x4 */ #define SIZE_CG_IN_BIT 2 /* CG С 4x4 */ #define MAX_CG_NUM_IN_TU (1 << ((MAX_TU_SIZE_IN_BIT - SIZE_CG_IN_BIT) << 1)) /* --------------------------------------------------------------------------- * temporal level (layer) */ #define TEMPORAL_MAXLEVEL 8 /* max number of temporal levels */ #define TEMPORAL_MAXLEVEL_BIT 3 /* bits of temporal level */ /* --------------------------------------------------------------------------- * SAO (Sample Adaptive Offset) */ #define NUM_BO_OFFSET 32 /*BOģʽoffset4*/ #define MAX_NUM_SAO_CLASSES 32 /*offset*/ #define NUM_SAO_BO_CLASSES_LOG2 5 /**/ #define NUM_SAO_BO_CLASSES_IN_BIT 5 /**/ #define NUM_SAO_BO_CLASSES (1 << NUM_SAO_BO_CLASSES_LOG2) /*BOģʽstartbandĿ*/ #define SAO_RATE_THR 1.0 /*ȷRDO*/ #define SAO_RATE_CHROMA_THR 1.0 /*ɫȷRDO*/ #define SAO_SHIFT_PIX_NUM 4 /*SAOƫƵص*/ #define MAX_DOUBLE 1.7e+308 /* --------------------------------------------------------------------------- * ALF (Adaptive Loop Filter) */ #define ALF_MAX_NUM_COEF 9 #define NO_VAR_BINS 16 #define LOG2_VAR_SIZE_H 2 #define LOG2_VAR_SIZE_W 2 #define ALF_FOOTPRINT_SIZE 7 #define DF_CHANGED_SIZE 3 #define ALF_NUM_BIT_SHIFT 6 #define LAMBDA_SCALE_LUMA (1.0) /* scale for luma */ #define LAMBDA_SCALE_CHROMA (1.0) /* scale for chroma */ /* --------------------------------------------------------------------------- * threshold values to zero out quantized transform coefficients */ #define LUMA_COEFF_COST 1 /* threshold for luma coefficients */ #define MAX_COEFF_QUASI_ZERO 8 /* threshold for quasi zero block detection with luma coefficients */ /* --------------------------------------------------------------------------- * number of luma intra modes for full RDO */ #define INTRA_MODE_NUM_FOR_RDO 9 /* number of luma intra modes for full RDO */ /* --------------------------------------------------------------------------- * max values */ #define MAX_DISTORTION (1 << 30) /* maximum distortion (1 << bitdepth)^2 * (MAX_CU_SIZE)^2 */ #define XAVS2_THREAD_MAX 128 /* max number of threads */ #define XAVS2_BS_HEAD_LEN 256 /* length of bitstream buffer for headers */ #define XAVS2_PAD (64 + 16) /* number of pixels padded around the reference frame */ #define MAX_COST (1LL << 50) /* used for start value for cost variables */ #define MAX_FRAME_INDEX 0x3FFFFF00 /* max frame index */ #define MAX_REFS XAVS2_MAX_REFS /* max number of reference frames */ #define MAX_SLICES 8 /* max number of slices in one picture */ #define MAX_PARALLEL_FRAMES 8 /* max number of parallel encoding frames */ #define MAX_COI_VALUE ((1<<8) - 1) /* max COI value (unsigned char) */ #define PIXEL_MAX ((1< * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_IN_LOOP_FILTERS_H #define XAVS2_IN_LOOP_FILTERS_H /** * =========================================================================== * global/local variables * =========================================================================== */ static const int tab_saoclip[NUM_SAO_OFFSET][3] = { // EO { -1, 6, 7 }, // low bound, upper bound, threshold { 0, 1, 1 }, { 0, 0, 0 }, { -1, 0, 1 }, { -6, 1, 7 }, { -7, 7, 7 } // BO }; /* --------------------------------------------------------------------------- * lcu neighbor */ enum lcu_neighbor_e { SAO_T = 0, /* top */ SAO_D = 1, /* down */ SAO_L = 2, /* left */ SAO_R = 3, /* right */ SAO_TL = 4, /* top-left */ SAO_TR = 5, /* top-right */ SAO_DL = 6, /* down-left */ SAO_DR = 7 /* down-right */ }; typedef struct sao_region_t { int pix_x[NUM_SAO_COMPONENTS]; /* start pixel position in x */ int pix_y[NUM_SAO_COMPONENTS]; /* start pixel position in y */ int width[NUM_SAO_COMPONENTS]; /* */ int height[NUM_SAO_COMPONENTS]; /* */ /* availabilities of neighboring blocks */ int8_t b_left; int8_t b_top_left; int8_t b_top; int8_t b_top_right; int8_t b_right; int8_t b_right_down; int8_t b_down; int8_t b_down_left; } sao_region_t; #define xavs2_lcu_deblock FPFX(lcu_deblock) void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm); #endif // XAVS2_IN_LOOP_FILTERS_H xavs2-1.3/source/common/filter_alf.c000066400000000000000000000224641340660520300174720ustar00rootroot00000000000000/* * filter_alf.h * * Description of this file: * ALF filter functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "primitives.h" #include "filter.h" #include "cudata.h" #include "cpu.h" /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void alf_filter_block1(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { const int pel_add = 1 << (ALF_NUM_BIT_SHIFT - 1); int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); int xPosEnd = lcu_pix_x + lcu_width; int min_x = lcu_pix_x - 3; int max_x = xPosEnd - 1 + 3; int yUp, yBottom; int xLeft, xRight; int x, y, pel_val; pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; p_src += (startPos * i_src); p_dst += (startPos * i_dst); for (y = startPos; y < endPos; y++) { yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 1); yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 1); p_src1 = p_src + (yBottom - y) * i_src; p_src2 = p_src + (yUp - y) * i_src; yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 2); yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 2); p_src3 = p_src + (yBottom - y) * i_src; p_src4 = p_src + (yUp - y) * i_src; yUp = XAVS2_CLIP3(startPos, endPos - 1, y - 3); yBottom = XAVS2_CLIP3(startPos, endPos - 1, y + 3); p_src5 = p_src + (yBottom - y) * i_src; p_src6 = p_src + (yUp - y) * i_src; for (x = lcu_pix_x; x < xPosEnd; x++) { pel_val = alf_coeff[0] * (p_src5[x] + p_src6[x]); pel_val += alf_coeff[1] * (p_src3[x] + p_src4[x]); xLeft = XAVS2_CLIP3(min_x, max_x, x - 1); xRight = XAVS2_CLIP3(min_x, max_x, x + 1); pel_val += alf_coeff[2] * (p_src1[xRight] + p_src2[xLeft ]); pel_val += alf_coeff[3] * (p_src1[x ] + p_src2[x ]); pel_val += alf_coeff[4] * (p_src1[xLeft ] + p_src2[xRight]); pel_val += alf_coeff[7] * (p_src [xRight] + p_src [xLeft ]); xLeft = XAVS2_CLIP3(min_x, max_x, x - 2); xRight = XAVS2_CLIP3(min_x, max_x, x + 2); pel_val += alf_coeff[6] * (p_src [xRight] + p_src [xLeft ]); xLeft = XAVS2_CLIP3(min_x, max_x, x - 3); xRight = XAVS2_CLIP3(min_x, max_x, x + 3); pel_val += alf_coeff[5] * (p_src [xRight] + p_src [xLeft ]); pel_val += alf_coeff[8] * (p_src [x ]); pel_val = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT; p_dst[x] = (pel_t)XAVS2_CLIP1(pel_val); } p_src += i_src; p_dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void alf_filter_block2(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; int pixelInt; int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); /* first line */ p_src += (startPos * i_src) + lcu_pix_x; p_dst += (startPos * i_dst) + lcu_pix_x; if (p_src[0] != p_src[-1]) { p_src1 = p_src + 1 * i_src; p_src2 = p_src; p_src3 = p_src + 2 * i_src; p_src4 = p_src; p_src5 = p_src + 3 * i_src; p_src6 = p_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[ 0]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; p_dst += lcu_width - 1; if (p_src[0] != p_src[1]) { p_src1 = p_src + 1 * i_src; p_src2 = p_src; p_src3 = p_src + 2 * i_src; p_src4 = p_src; p_src5 = p_src + 3 * i_src; p_src6 = p_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 0]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); } /* last line */ p_src -= lcu_width - 1; p_dst -= lcu_width - 1; p_src += ((endPos - startPos - 1) * i_src); p_dst += ((endPos - startPos - 1) * i_dst); if (p_src[0] != p_src[-1]) { p_src1 = p_src; p_src2 = p_src - 1 * i_src; p_src3 = p_src; p_src4 = p_src - 2 * i_src; p_src5 = p_src; p_src6 = p_src - 3 * i_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[ 0] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; p_dst += lcu_width - 1; if (p_src[0] != p_src[1]) { p_src1 = p_src; p_src2 = p_src - 1 * i_src; p_src3 = p_src; p_src4 = p_src - 2 * i_src; p_src5 = p_src; p_src6 = p_src - 3 * i_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 0] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)XAVS2_CLIP1(pixelInt); } } /* --------------------------------------------------------------------------- */ void xavs2_alf_init(uint32_t cpuid, intrinsic_func_t *pf) { /* set function handles */ pf->alf_flt[0] = alf_filter_block1; pf->alf_flt[1] = alf_filter_block2; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { 
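        /* Only the first ALF filter handle has an SSE128 replacement here:
         * alf_flt[0] is overridden on SSE4.2-capable CPUs, while alf_flt[1]
         * keeps the C implementation assigned above. */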
pf->alf_flt[0] = alf_flt_one_block_sse128; } #else UNUSED_PARAMETER(cpuid); #endif } xavs2-1.3/source/common/filter_deblock.c000066400000000000000000000562501340660520300203330ustar00rootroot00000000000000/* * filter_deblock.h * * Description of this file: * Deblock filter functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "primitives.h" #include "filter.h" #include "cudata.h" #include "cpu.h" /** * =========================================================================== * global/local variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ const uint8_t tab_deblock_alpha[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 22, 24, 26, 28, 30, 33, 33, 35, 35, 36, 37, 37, 39, 39, 42, 44, 46, 48, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 }; /* --------------------------------------------------------------------------- */ const uint8_t tab_deblock_beta[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27 }; /* --------------------------------------------------------------------------- * edge direction for deblock */ enum edge_direction_e { EDGE_HOR = 1, /* horizontal */ EDGE_VER = 0 /* vertical */ }; /* --------------------------------------------------------------------------- * edge type for fitler control */ enum edge_type_e { EDGE_TYPE_NOFILTER = 0, /* no deblock filter */ EDGE_TYPE_ONLY_LUMA = 1, /* TU boundary in CU (chroma block does not have such boundaries) */ EDGE_TYPE_BOTH = 2 /* CU boundary and PU boundary */ }; /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void lf_set_edge_filter_param(xavs2_t *h, int i_level, int scu_x, int scu_y, int dir, int edge_type) { const int w_in_scu = h->i_width_in_mincu; // const int h_in_scu = h->i_height_in_mincu; const int y_in_lcu = scu_y - h->lcu.i_scu_y; int scu_num = 1 << 
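                  /* scu_num: CU size expressed in 8x8 SCU units, i.e. the number of
                   * SCUs covered by one edge of this CU */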
(i_level - MIN_CU_SIZE_IN_BIT); int i; if (dir == EDGE_VER) { /* set flag of vertical edges */ if (scu_x == 0) { return; } /* TODO: Is left border Slice border? */ /* set filter type */ // scu_num = XAVS2_MIN(scu_num, h_in_scu - scu_y); for (i = 0; i < scu_num; i++) { if (h->p_deblock_flag[EDGE_VER][(y_in_lcu + i) * w_in_scu + scu_x] != EDGE_TYPE_NOFILTER) { break; } h->p_deblock_flag[EDGE_VER][(y_in_lcu + i) * w_in_scu + scu_x] = (uint8_t)edge_type; } } else { /* set flag of horizontal edges */ if (scu_y == 0) { return; } /* Is this border a slice border inside the picture? */ if (cu_get_slice_index(h, scu_x, scu_y) != cu_get_slice_index(h, scu_x, scu_y - 1)) { if (!h->param->b_cross_slice_loop_filter) { return; } } /* set filter type */ // scu_num = XAVS2_MIN(scu_num, w_in_scu - scu_x); for (i = 0; i < scu_num; i++) { if (h->p_deblock_flag[EDGE_HOR][y_in_lcu * w_in_scu + scu_x + i] != EDGE_TYPE_NOFILTER) { break; } h->p_deblock_flag[EDGE_HOR][y_in_lcu * w_in_scu + scu_x + i] = (uint8_t)edge_type; } } } /* --------------------------------------------------------------------------- */ static void lf_lcu_set_edge_filter(xavs2_t *h, int i_level, int scu_x, int scu_y, int scu_xy) { cu_info_t *p_cu_info = &h->cu_info[scu_xy]; int i; assert(p_cu_info->i_level >= MIN_CU_SIZE_IN_BIT); if (p_cu_info->i_level < i_level) { const int w_in_scu = h->i_width_in_mincu; const int h_in_scu = h->i_height_in_mincu; // 4 sub-cu for (i = 0; i < 4; i++) { int sub_cu_x = (i & 1) << (i_level - MIN_CU_SIZE_IN_BIT - 1); int sub_cu_y = (i >> 1) << (i_level - MIN_CU_SIZE_IN_BIT - 1); int pos; if (scu_x + sub_cu_x >= w_in_scu || scu_y + sub_cu_y >= h_in_scu) { continue; // is outside of the frame } pos = scu_xy + sub_cu_y * w_in_scu + sub_cu_x; lf_lcu_set_edge_filter(h, i_level - 1, scu_x + sub_cu_x, scu_y + sub_cu_y, pos); } } else { // set the first left and top edge filter parameters lf_set_edge_filter_param(h, i_level, scu_x, scu_y, EDGE_VER, EDGE_TYPE_BOTH); // left edge lf_set_edge_filter_param(h, i_level, scu_x, scu_y, EDGE_HOR, EDGE_TYPE_BOTH); // top edge // set other edge filter parameters if (p_cu_info->i_level > MIN_CU_SIZE_IN_BIT) { /* set prediction boundary */ i = i_level - MIN_CU_SIZE_IN_BIT - 1; switch (p_cu_info->i_mode) { case PRED_2NxN: lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_BOTH); break; case PRED_Nx2N: lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_BOTH); break; case PRED_I_NxN: lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_BOTH); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_BOTH); break; case PRED_I_2Nxn: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 2, EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 3, EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } break; case PRED_I_nx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 2, scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 3, scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, 
EDGE_TYPE_ONLY_LUMA); } break; case PRED_2NxnU: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_BOTH); } break; case PRED_2NxnD: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 3, EDGE_HOR, EDGE_TYPE_BOTH); } break; case PRED_nLx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_BOTH); } break; case PRED_nRx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 3, scu_y, EDGE_VER, EDGE_TYPE_BOTH); } break; default: // for other modes: direct/skip, 2Nx2N inter, 2Nx2N intra, no need to set break; } /* set transform block boundary */ if (p_cu_info->i_mode != PRED_I_NxN && p_cu_info->i_tu_split && p_cu_info->i_cbp != 0) { if (h->param->enable_nsqt && IS_HOR_PU_PART(p_cu_info->i_mode)) { if (p_cu_info->i_level == B16X16_IN_BIT) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )) + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } } else if (h->param->enable_nsqt && IS_VER_PU_PART(p_cu_info->i_mode)) { if (p_cu_info->i_level == B16X16_IN_BIT) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )) + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } } } } } /* --------------------------------------------------------------------------- * Return 1 if skip filtering is needed */ static INLINE uint8_t lf_skip_filter(xavs2_t *h, cu_info_t *MbP, cu_info_t *MbQ, int dir, int block_x, int block_y) { if (h->i_type == SLICE_TYPE_P || h->i_type == SLICE_TYPE_F) { const mv_t *p_mv_buf = h->fwd_1st_mv; const int8_t *p_ref_buf = h->fwd_1st_ref; int w_in_4x4 = h->i_width_in_minpu; int block_x2 = block_x - !dir; int block_y2 = block_y - dir; int pos1 = block_y * w_in_4x4 + block_x; int pos2 = block_y2 * w_in_4x4 + block_x2; if ((MbP->i_cbp == 0) && (MbQ->i_cbp == 0) && (XAVS2_ABS(p_mv_buf[pos1].x - p_mv_buf[pos2].x) < 4) && (XAVS2_ABS(p_mv_buf[pos1].y - p_mv_buf[pos2].y) < 4) && (p_ref_buf[pos1] != INVALID_REF && p_ref_buf[pos1] == p_ref_buf[pos2])) { return 0; } } return 1; } /* --------------------------------------------------------------------------- */ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag) { int pel; int abs_delta; int L2, L1, L0, R0, R1, R2; int fs; // fs stands for filtering strength. The larger fs is, the stronger filter is applied. int FlatnessL, FlatnessR; int inc2, inc3; int flag = 0; inc2 = inc1 << 1; inc3 = inc1 + inc2; for (pel = 0; pel < MIN_CU_SIZE; pel++) { L2 = src[-inc3]; L1 = src[-inc2]; L0 = src[-inc1]; R0 = src[ 0]; R1 = src[ inc1]; R2 = src[ inc2]; abs_delta = XAVS2_ABS(R0 - L0); flag = (pel < 4) ? 
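               /* the 8-sample edge is handled as two 4-sample halves, each with its
                * own skip flag (b_filter_edge[0]/[1] filled in lf_scu_deblock) */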
flt_flag[0] : flt_flag[1]; if (flag && (abs_delta < alpha) && (abs_delta > 1)) { FlatnessL = (XAVS2_ABS(L1 - L0) < beta) ? 2 : 0; if (XAVS2_ABS(L2 - L0) < beta) { FlatnessL += 1; } FlatnessR = (XAVS2_ABS(R0 - R1) < beta) ? 2 : 0; if (XAVS2_ABS(R0 - R2) < beta) { FlatnessR += 1; } switch (FlatnessL + FlatnessR) { case 6: fs = (R1 == R0 && L0 == L1) ? 4 : 3; break; case 5: fs = (R1 == R0 && L0 == L1) ? 3 : 2; break; case 4: fs = (FlatnessL == 2) ? 2 : 1; break; case 3: fs = (XAVS2_ABS(L1 - R1) < beta) ? 1 : 0; break; default: fs = 0; break; } if (b_chroma && fs > 0) { fs--; } switch (fs) { case 4: src[-inc1] = (pel_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5); // L0 src[-inc2] = (pel_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4); // L1 src[-inc3] = (pel_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3); // L2 src[ 0] = (pel_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5); // R0 src[ inc1] = (pel_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4); // R1 src[ inc2] = (pel_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3); // R2 break; case 3: src[-inc1] = (pel_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4); // L0 src[ 0] = (pel_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4); // R0 src[-inc2] = (pel_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4); src[ inc1] = (pel_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4); break; case 2: src[-inc1] = (pel_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4); src[ 0] = (pel_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4); break; case 1: src[-inc1] = (pel_t)((L0 * 3 + R0 + 2) >> 2); src[ 0] = (pel_t)((R0 * 3 + L0 + 2) >> 2); break; default: break; } } src += ptr_inc; // next row or column pel += b_chroma; } } /* --------------------------------------------------------------------------- */ static void deblock_edge_hor(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src, 0, 1, stride, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static void deblock_edge_ver(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src, 0, stride, 1, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static void deblock_edge_ver_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src_u, 1, stride, 1, alpha, beta, flt_flag); lf_edge_core(src_v, 1, stride, 1, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static void deblock_edge_hor_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src_u, 1, 1, stride, alpha, beta, flt_flag); lf_edge_core(src_v, 1, 1, stride, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static void lf_scu_deblock(xavs2_t *h, pel_t *p_rec[3], int i_stride, int i_stride_c, int scu_x, int scu_y, int dir) { static const int max_qp_deblock = 63; cu_info_t *MbQ = &h->cu_info[scu_y * h->i_width_in_mincu + scu_x]; /* current SCU */ int edge_type = h->p_deblock_flag[dir][(scu_y - h->lcu.i_scu_y) * h->i_width_in_mincu + scu_x]; if (edge_type != EDGE_TYPE_NOFILTER) { pel_t *src_y = p_rec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * i_stride + (scu_x << MIN_CU_SIZE_IN_BIT); cu_info_t *MbP = dir ? 
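                          /* for horizontal edges the neighbour is the SCU above,
                           * for vertical edges it is the SCU to the left */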
(MbQ - h->i_width_in_mincu) : (MbQ - 1); /* MbP = Mb of the remote 4x4 block */ int QP = (cu_get_qp(h, MbP) + cu_get_qp(h, MbQ) + 1) >> 1; /* average QP of the two blocks */ int shift = h->param->sample_bit_depth - 8; int offset = shift << 3; /* coded as 10/12 bit, QP is added by (8 * (h->param->sample_bit_depth - 8)) in config file */ int alpha, beta; uint8_t b_filter_edge[2]; b_filter_edge[0] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1), (scu_y << 1)); b_filter_edge[1] = lf_skip_filter(h, MbP, MbQ, dir, (scu_x << 1) + dir, (scu_y << 1) + !dir); if (b_filter_edge[0] == 0 && b_filter_edge[1] == 0) { return; } /* deblock luma edge */ alpha = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->alpha_c_offset)] << shift; beta = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP - offset + h->param->beta_offset)] << shift; g_funcs.deblock_luma[dir](src_y, i_stride, alpha, beta, b_filter_edge); assert(h->param->chroma_format == CHROMA_420 || h->param->chroma_format == CHROMA_400); /* only support I420/I400 now */ /* deblock chroma edge */ if (edge_type == EDGE_TYPE_BOTH && h->param->chroma_format == CHROMA_420) if ((((scu_y & 1) == 0) && dir) || (((scu_x & 1) == 0) && (!dir))) { pel_t *src_u = p_rec[1] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); pel_t *src_v = p_rec[2] + (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * i_stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); int alpha_c, beta_c; QP = cu_get_chroma_qp(h, QP, 0) - offset; alpha_c = tab_deblock_alpha[XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->alpha_c_offset)] << shift; beta_c = tab_deblock_beta [XAVS2_CLIP3(0, max_qp_deblock, QP + h->param->beta_offset)] << shift; g_funcs.deblock_chroma[dir](src_u, src_v, i_stride_c, alpha_c, beta_c, b_filter_edge); } } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ void xavs2_lcu_deblock(xavs2_t *h, xavs2_frame_t *frm) { const int i_stride = frm->i_stride[0]; const int i_stride_c = frm->i_stride[1]; const int w_in_scu = h->i_width_in_mincu; int scu_x = h->lcu.i_scu_x; int scu_y = h->lcu.i_scu_y; int num_of_scu_hor = h->lcu.i_pix_width >> MIN_CU_SIZE_IN_BIT; int num_of_scu_ver = h->lcu.i_pix_height >> MIN_CU_SIZE_IN_BIT; uint8_t *p_fbuf0 = h->p_deblock_flag[0] + scu_x; uint8_t *p_fbuf1 = h->p_deblock_flag[1] + scu_x; int i, j; /* clear edge flags in one LCU */ int size_setzero = num_of_scu_hor * sizeof(uint8_t); for (j = 0; j < num_of_scu_ver; j++) { g_funcs.fast_memzero(p_fbuf0, size_setzero); g_funcs.fast_memzero(p_fbuf1, size_setzero); p_fbuf0 += w_in_scu; p_fbuf1 += w_in_scu; } /* set edge flags in one LCU */ lf_lcu_set_edge_filter(h, h->i_lcu_level, h->lcu.i_scu_x, h->lcu.i_scu_y, h->lcu.i_scu_xy); /* deblock all vertical edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER); } } /* adjust the value of scu_x and num_of_scu_hor */ if (scu_x == 0) { /* the current LCU is the first LCU in a LCU row */ num_of_scu_hor--; /* leave the last horizontal edge */ } else { /* the current LCU is one of the rest LCUs in a row */ if (scu_x + num_of_scu_hor == w_in_scu) { /* the current LCU is the last LCUs in a row, * need deblock one horizontal edge more */ num_of_scu_hor++; } scu_x--; /* begin from 
the last horizontal edge of previous LCU */ } /* deblock all horizontal edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR); } } } /* --------------------------------------------------------------------------- */ void xavs2_deblock_init(uint32_t cpuid, intrinsic_func_t* lf) { lf->deblock_luma [0] = deblock_edge_ver; lf->deblock_luma [1] = deblock_edge_hor; lf->deblock_chroma[0] = deblock_edge_ver_c; lf->deblock_chroma[1] = deblock_edge_hor_c; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { lf->deblock_luma[0] = deblock_edge_ver_sse128; lf->deblock_luma[1] = deblock_edge_hor_sse128; // lf->deblock_chroma[0] = deblock_edge_ver_c_sse128; // lf->deblock_chroma[1] = deblock_edge_hor_c_sse128; } if (cpuid & XAVS2_CPU_AVX2) { // In some machines, avx is slower than SSE // lf->deblock_luma[0] = deblock_edge_ver_avx2; // lf->deblock_luma[1] = deblock_edge_hor_avx2; // lf->deblock_chroma[0] = deblock_edge_ver_c_avx2; // lf->deblock_chroma[1] = deblock_edge_hor_c_avx2; } #else UNUSED_PARAMETER(cpuid); #endif } xavs2-1.3/source/common/filter_sao.c000066400000000000000000000232401340660520300175030ustar00rootroot00000000000000/* * filter_sao.h * * Description of this file: * SAO filter functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "primitives.h" #include "filter.h" #include "cudata.h" #include "cpu.h" /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void sao_block_c(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param) { int8_t SIGN_BUF[MAX_CU_SIZE + 32]; // sign of top line int8_t *UPROW_S = SIGN_BUF + 16; int *sao_offset = sao_param->offset; const int max_pel_val = (1 << g_bit_depth) - 1; int reg = 0; int sx, sy, ex, ey; // start/end (x, y) int sx_0, ex_0, sx_n, ex_n; // start/end x for first and last row int left_sign, right_sign, top_sign, down_sign; int edge_type; int pel_diff; int x, y; assert(sao_param->typeIdc != SAO_TYPE_OFF); switch (sao_param->typeIdc) { case SAO_TYPE_EO_0: sx = lcu_avail[SAO_L] ? 0 : 1; ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); for (y = 0; y < i_block_h; y++) { left_sign = xavs2_sign3(p_src[sx] - p_src[sx - 1]); for (x = sx; x < ex; x++) { right_sign = xavs2_sign3(p_src[x] - p_src[x + 1]); edge_type = left_sign + right_sign + 2; left_sign = -right_sign; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; } break; case SAO_TYPE_EO_90: { sy = lcu_avail[SAO_T] ? 0 : 1; ey = lcu_avail[SAO_D] ? i_block_h : (i_block_h - 1); for (x = 0; x < i_block_w; x++) { pel_diff = p_src[sy * i_src + x] - p_src[(sy - 1) * i_src + x]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); for (y = sy; y < ey; y++) { pel_diff = p_src[y * i_src + x] - p_src[(y + 1) * i_src + x]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + top_sign + 2; top_sign = -down_sign; p_dst[y * i_dst + x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]); } } break; } case SAO_TYPE_EO_135: sx = lcu_avail[SAO_L] ? 0 : 1; ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); // init the line buffer for (x = sx; x < ex; x++) { pel_diff = p_src[i_src + x + 1] - p_src[x]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x + 1] = (int8_t)top_sign; } // first row sx_0 = lcu_avail[SAO_TL] ? 0 : 1; ex_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1; for (x = sx_0; x < ex_0; x++) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = top_sign - UPROW_S[x + 1] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows for (y = 1; y < i_block_h - 1; y++) { p_src += i_src; p_dst += i_dst; for (x = sx; x < ex; x++) { if (x == sx) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x] = (int8_t)reg; reg = -down_sign; } } // last row sx_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); ex_n = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1); p_src += i_src; p_dst += i_dst; for (x = sx_n; x < ex_n; x++) { if (x == sx) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? 
-1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } break; case SAO_TYPE_EO_45: sx = lcu_avail[SAO_L] ? 0 : 1; ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); // init the line buffer for (x = sx; x < ex; x++) { pel_diff = p_src[i_src + x - 1] - p_src[x]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x - 1] = (int8_t)top_sign; } // first row sx_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); ex_0 = lcu_avail[SAO_TR] ? i_block_w : (i_block_w - 1); for (x = sx_0; x < ex_0; x++) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = top_sign - UPROW_S[x - 1] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows for (y = 1; y < i_block_h - 1; y++) { p_src += i_src; p_dst += i_dst; for (x = sx; x < ex; x++) { if (x == ex - 1) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x - 1] = (int8_t)(-down_sign); } } // last row sx_n = lcu_avail[SAO_DL] ? 0 : 1; ex_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1; p_src += i_src; p_dst += i_dst; for (x = sx_n; x < ex_n; x++) { if (x == ex - 1) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } break; case SAO_TYPE_BO: pel_diff = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x++) { edge_type = p_src[x] >> pel_diff; p_dst[x] = (pel_t)XAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; } break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a supported SAO types."); assert(0); exit(-1); } } /* --------------------------------------------------------------------------- */ void xavs2_sao_init(uint32_t cpuid, intrinsic_func_t *pf) { pf->sao_block = sao_block_c; #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE4) { pf->sao_block = SAO_on_block_sse128; } #ifdef _MSC_VER if (cpuid & XAVS2_CPU_AVX2) { pf->sao_block = SAO_on_block_sse256; } #endif // if _MSC_VER #endif // HAVE_MMX } xavs2-1.3/source/common/frame.c000066400000000000000000000556431340660520300164620ustar00rootroot00000000000000/* * frame.c * * Description of this file: * Frame handling functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "wrapper.h" #include "frame.h" /** * =========================================================================== * macro defines * =========================================================================== */ /* --------------------------------------------------------------------------- * pointer */ #define get_plane_ptr(...) \ MULTI_LINE_MACRO_BEGIN\ if (get_plane_pointer(__VA_ARGS__) < 0) {\ return -1;\ }\ MULTI_LINE_MACRO_END /** * =========================================================================== * memory handling * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int align_stride(int x, int align, int disalign) { x = XAVS2_ALIGN(x, align); if (!(x & (disalign - 1))) { x += align; } return x; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int align_plane_size(int x, int disalign) { if (!(x & (disalign - 1))) { x += 128; } return x; } /* --------------------------------------------------------------------------- */ size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type) { int img_w_l = ((param->org_width + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; int img_h_l = ((param->org_height + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; int img_w_c = img_w_l >> (param->chroma_format <= CHROMA_420 ? 1 : 0); int img_h_c = img_h_l >> (param->chroma_format <= CHROMA_420 ? 
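                  /* chroma planes are half resolution in both dimensions for I420
                   * (and for I400, where they are left unused) */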
1 : 0); int align = 32; int disalign = 1 << 16; int stride_l, stride_c; int size_l, size_c; /* size of luma and chroma plane */ int i_nal_info_size = 0; int mem_size; /* total memory size */ int planes_size; int bs_size = 0; /* reuse the YUV plane space */ int cmp_size = 0; /* size of frame complexity buffer */ int cmp_buf_size = 0; /* complexity buffer size */ #if SAVE_CU_INFO int frame_size_in_mincu = 0; #endif int frame_size_in_mvstore = 0; /* reference information size */ /* compute stride and the plane size */ switch (alloc_type) { case FT_DEC: /* +PAD for extra data for me */ stride_l = align_stride(img_w_l + ((XAVS2_PAD << 1) ), align, disalign); stride_c = align_stride(img_w_c + ((XAVS2_PAD >> 1) << 1), align, disalign); size_l = align_plane_size(stride_l * (img_h_l + ((XAVS2_PAD << 1) ) + 1), disalign); size_c = align_plane_size(stride_c * (img_h_c + ((XAVS2_PAD >> 1) << 1) + 1), disalign); #if SAVE_CU_INFO frame_size_in_mincu = (img_w_l >> MIN_CU_SIZE_IN_BIT) * (img_h_l >> MIN_CU_SIZE_IN_BIT); #endif frame_size_in_mvstore = (((img_w_l >> MIN_PU_SIZE_IN_BIT) + 3) >> 2) * (((img_h_l >> MIN_PU_SIZE_IN_BIT) + 3) >> 2); planes_size = size_l + size_c * 2; #if ENABLE_FRAME_SUBPEL_INTPL planes_size += size_l * 15; #endif break; case FT_TEMP: /* +PAD for extra data for me */ stride_l = align_stride(img_w_l + ((XAVS2_PAD << 1) ), align, disalign); stride_c = align_stride(img_w_c + ((XAVS2_PAD >> 1) << 1), align, disalign); size_l = align_plane_size(stride_l * (img_h_l + ((XAVS2_PAD << 1) ) + 1), disalign); size_c = align_plane_size(stride_c * (img_h_c + ((XAVS2_PAD >> 1) << 1) + 1), disalign); planes_size = size_l + size_c * 2; break; default: stride_l = align_stride(img_w_l, align, disalign); stride_c = align_stride(img_w_c, align, disalign); size_l = align_plane_size(stride_l * img_h_l, disalign); size_c = align_plane_size(stride_c * img_h_c, disalign); planes_size = size_l + size_c * 2; } if (alloc_type == FT_ENC) { #if XAVS2_ADAPT_LAYER i_nal_info_size = (param->slice_num + 6) * sizeof(xavs2_nal_info_t); #endif bs_size = size_l * sizeof(uint8_t); /* let the PSNR compute correctly */ } /* compute space size and alloc memory */ mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ i_nal_info_size + /* M1, size of nal_info buffer */ cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ bs_size + /* M3, size of bitstream buffer */ planes_size * sizeof(pel_t) + /* M4, size of planes buffer: Y+U+V */ frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ #if SAVE_CU_INFO frame_size_in_mincu * sizeof(int8_t) * 3 + /* M7, size of cu mode/cbp/level buffers */ #endif (img_h_l >> MIN_CU_SIZE_IN_BIT) * sizeof(int)+ /* M8, line status array */ CACHE_LINE_SIZE * 10; /* align to CACHE_LINE_SIZE */ mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); return mem_size; } /* --------------------------------------------------------------------------- */ xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type) { xavs2_frame_t *frame; int img_w_l = h->i_width; int img_h_l = h->i_height; int img_w_c = img_w_l >> (h->param->chroma_format <= CHROMA_420 ? 1 : 0); int img_h_c = img_h_l >> (h->param->chroma_format <= CHROMA_420 ? 
1 : 0); int align = 32; int disalign = 1 << 16; int stride_l, stride_c; int size_l, size_c; /* size of luma and chroma plane */ int i_nal_info_size = 0; int mem_size; /* total memory size */ int planes_size, i; int bs_size = 0; /* reuse the YUV plane space */ int cmp_size = 0; /* size of frame complexity buffer */ int cmp_buf_size = 0; /* complexity buffer size */ #if SAVE_CU_INFO int frame_size_in_mincu = 0; #endif int frame_size_in_mvstore = 0; /* reference information size */ uint8_t *mem_ptr; /* compute stride and the plane size */ switch (alloc_type) { case FT_DEC: /* +PAD for extra data for me */ stride_l = align_stride(img_w_l + ((XAVS2_PAD << 1) ), align, disalign); stride_c = align_stride(img_w_c + ((XAVS2_PAD >> 1) << 1), align, disalign); size_l = align_plane_size(stride_l * (img_h_l + ((XAVS2_PAD << 1) ) + 1), disalign); size_c = align_plane_size(stride_c * (img_h_c + ((XAVS2_PAD >> 1) << 1) + 1), disalign); #if SAVE_CU_INFO frame_size_in_mincu = h->i_width_in_mincu * h->i_height_in_mincu; #endif frame_size_in_mvstore = ((h->i_width_in_minpu + 3) >> 2) * ((h->i_height_in_minpu + 3) >> 2); planes_size = size_l + size_c * 2; #if ENABLE_FRAME_SUBPEL_INTPL if (h->use_fractional_me == 1) { planes_size += size_l * 3; } else if (h->use_fractional_me == 2) { planes_size += size_l * 15; } #endif break; case FT_TEMP: /* for SAO and ALF */ /* +PAD for extra data for me */ stride_l = align_stride(img_w_l + ((XAVS2_PAD << 1) ), align, disalign); stride_c = align_stride(img_w_c + ((XAVS2_PAD >> 1) << 1), align, disalign); size_l = align_plane_size(stride_l * (img_h_l + ((XAVS2_PAD << 1) ) + 1), disalign); size_c = align_plane_size(stride_c * (img_h_c + ((XAVS2_PAD >> 1) << 1) + 1), disalign); planes_size = size_l + size_c * 2; break; default: stride_l = align_stride(img_w_l, align, disalign); stride_c = align_stride(img_w_c, align, disalign); size_l = align_plane_size(stride_l * img_h_l, disalign); size_c = align_plane_size(stride_c * img_h_c, disalign); planes_size = size_l + size_c * 2; } if (alloc_type == FT_ENC) { #if XAVS2_ADAPT_LAYER i_nal_info_size = (h->param->slice_num + 6) * sizeof(xavs2_nal_info_t); #endif bs_size = size_l * sizeof(uint8_t); /* let the PSNR compute correctly */ } /* compute space size and alloc memory */ mem_size = sizeof(xavs2_frame_t) + /* M0, size of frame handle */ i_nal_info_size + /* M1, size of nal_info buffer */ cmp_size + cmp_buf_size + /* M2, size of frame complexity buffer */ bs_size + /* M3, size of bitstream buffer */ planes_size * sizeof(pel_t) + /* M4, size of planes buffer: Y+U+V */ frame_size_in_mvstore * sizeof(int8_t) + /* M5, size of pu reference index buffer */ frame_size_in_mvstore * sizeof(mv_t) + /* M6, size of pu motion vector buffer */ #if SAVE_CU_INFO frame_size_in_mincu * sizeof(int8_t) * 3 + /* M7, size of cu mode/cbp/level buffers */ #endif h->i_height_in_lcu * sizeof(int) + /* M8, line status array */ CACHE_LINE_SIZE * 10; /* align to CACHE_LINE_SIZE */ mem_size = (mem_size + CACHE_LINE_SIZE - 1) & (~(uint32_t)(CACHE_LINE_SIZE - 1)); if (mem_base == NULL) { CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size); } else { mem_ptr = *mem_base; } /* M0, frame handle */ frame = (xavs2_frame_t *)mem_ptr; mem_ptr += sizeof(xavs2_frame_t); ALIGN_POINTER(mem_ptr); /* set frame properties */ frame->i_plane = 3; /* planes: Y+U+V */ frame->i_width [0] = img_w_l; frame->i_lines [0] = img_h_l; frame->i_stride[0] = stride_l; frame->i_width [1] = frame->i_width [2] = img_w_c; frame->i_lines [1] = frame->i_lines [2] = img_h_c; frame->i_stride[1] = 
frame->i_stride[2] = stride_c; /* the default setting of a frame */ frame->i_frame = -1; frame->i_frm_coi = -1; frame->i_gop_idr_coi = -1; if (h->param->chroma_format == CHROMA_400) { frame->i_plane = 1; } frame->i_frm_type = XAVS2_TYPE_AUTO; frame->i_pts = -1; frame->i_dts = -1; frame->b_enable_intra = (h->param->enable_intra); /* buffer for fenc */ if (alloc_type == FT_ENC) { #if XAVS2_ADAPT_LAYER /* M1, nal_info buffer */ frame->nal_info = (xavs2_nal_info_t *)mem_ptr; frame->i_nal = 0; mem_ptr += i_nal_info_size; ALIGN_POINTER(mem_ptr); #endif /* M2, set the bit stream buffer pointer and length * NOTE: the size of bitstream buffer is big enough, no need to reallocate * memory in function encoder_encapsulate_nals */ frame->p_bs_buf = mem_ptr; frame->i_bs_buf = bs_size; /* the length is long enough */ mem_ptr += bs_size; } /* M3, buffer for planes: Y+U+V */ frame->plane_buf = (pel_t *)mem_ptr; frame->size_plane_buf = (size_l + 2 * size_c) * sizeof(pel_t); frame->planes[0] = (pel_t *)mem_ptr; frame->planes[1] = frame->planes[0] + size_l; frame->planes[2] = frame->planes[1] + size_c; mem_ptr += (size_l + size_c * 2) * sizeof(pel_t); if (alloc_type == FT_DEC || alloc_type == FT_TEMP) { uint8_t *p_align; /* point to plane data area */ frame->planes[0] += frame->i_stride[0] * (XAVS2_PAD ) + (XAVS2_PAD ); frame->planes[1] += frame->i_stride[1] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); frame->planes[2] += frame->i_stride[2] * (XAVS2_PAD / 2) + (XAVS2_PAD / 2); /* make sure the pointers are aligned */ p_align = (uint8_t *)frame->planes[0]; ALIGN_POINTER(p_align); frame->planes[0] = (pel_t *)p_align; p_align = (uint8_t *)frame->planes[1]; ALIGN_POINTER(p_align); frame->planes[1] = (pel_t *)p_align; p_align = (uint8_t *)frame->planes[2]; ALIGN_POINTER(p_align); frame->planes[2] = (pel_t *)p_align; } if (alloc_type == FT_DEC) { /* buffer for luma interpolated planes */ frame->filtered[0] = frame->planes[0]; // full pel plane, reused for (i = 1; i < 16; i++) { frame->filtered[i] = NULL; } #if ENABLE_FRAME_SUBPEL_INTPL switch (h->use_fractional_me) { case 1: frame->filtered[2] = (pel_t *)mem_ptr; mem_ptr += size_l * sizeof(pel_t); frame->filtered[8] = (pel_t *)mem_ptr; mem_ptr += size_l * sizeof(pel_t); frame->filtered[10] = (pel_t *)mem_ptr; mem_ptr += size_l * sizeof(pel_t); break; case 2: for (i = 1; i < 16; i++) { frame->filtered[i] = (pel_t *)mem_ptr; mem_ptr += size_l * sizeof(pel_t); } break; default: break; } #endif /* point to plane data area */ for (i = 1; i < 16; i++) { if (frame->filtered[i] != NULL) { frame->filtered[i] += frame->i_stride[0] * XAVS2_PAD + XAVS2_PAD; } } ALIGN_POINTER(mem_ptr); /* M4, reference index buffer */ frame->pu_ref = (int8_t *)mem_ptr; mem_ptr += frame_size_in_mvstore * sizeof(int8_t); ALIGN_POINTER(mem_ptr); /* M5, pu motion vector buffer */ frame->pu_mv = (mv_t *)mem_ptr; mem_ptr += frame_size_in_mvstore * sizeof(mv_t); ALIGN_POINTER(mem_ptr); #if SAVE_CU_INFO /* M6, cu mode/cbp/level buffers */ frame->cu_mode = (int8_t *)mem_ptr; mem_ptr += frame_size_in_mincu * sizeof(int8_t); ALIGN_POINTER(mem_ptr); frame->cu_cbp = (int8_t *)mem_ptr; mem_ptr += frame_size_in_mincu * sizeof(int8_t); ALIGN_POINTER(mem_ptr); frame->cu_level = (int8_t *)mem_ptr; mem_ptr += frame_size_in_mincu * sizeof(int8_t); ALIGN_POINTER(mem_ptr); #endif /* M7, line status array */ frame->num_lcu_coded_in_row = (int *)mem_ptr; mem_ptr += h->i_height_in_lcu * sizeof(int); ALIGN_POINTER(mem_ptr); memset(frame->num_lcu_sao_off, 0, sizeof(frame->num_lcu_sao_off)); } if (mem_ptr - (uint8_t *)frame > 
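    /* sanity check: everything carved out of the single allocation above must
     * stay within the mem_size computed at the top of this function */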
mem_size) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to alloc one frame, type %d\n", alloc_type); goto fail; } /* update mem_base */ if (mem_base != NULL) { *mem_base = mem_ptr; } /* initialize default value */ frame->i_qpplus1 = 0; frame->cnt_refered = 0; /* initialize signals */ if (xavs2_thread_mutex_init(&frame->mutex, NULL)) { goto fail; } if (xavs2_thread_cond_init(&frame->cond, NULL)) { goto fail; } return frame; fail: xavs2_free(mem_ptr); return NULL; } /* --------------------------------------------------------------------------- */ void xavs2_frame_delete(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { if (frame == NULL) { return; } UNUSED_PARAMETER(h_mgr); xavs2_thread_cond_destroy(&frame->cond); xavs2_thread_mutex_destroy(&frame->mutex); /* free the frame itself */ xavs2_free(frame); } /* --------------------------------------------------------------------------- */ void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { if (frame == NULL) { return; } UNUSED_PARAMETER(h_mgr); xavs2_thread_cond_destroy(&frame->cond); xavs2_thread_mutex_destroy(&frame->mutex); } /** * =========================================================================== * border expanding * =========================================================================== */ /* --------------------------------------------------------------------------- */ void plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom) { pel_t *pix = p_pix; pel_t *row; int y; /* --- horizontal ---------------------------------------------- */ for (y = 0; y < i_height; y++) { g_funcs.mem_repeat_p(pix - i_padh, pix[0 ], i_padh); /* left band */ g_funcs.mem_repeat_p(pix + i_width, pix[i_width - 1], i_padh); /* right band */ pix += i_stride; } /* --- vertical ------------------------------------------------ */ i_width += (i_padh << 1); /* upper band */ if (b_pad_top) { pix = row = p_pix - i_padh; /* start row position */ for (y = 0; y < i_padv; y++) { pix -= i_stride; memcpy(pix, row, i_width * sizeof(pel_t)); } } /* lower band */ if (b_pad_bottom) { pix = row = p_pix + (i_height - 1) * i_stride - i_padh; for (y = 0; y < i_padv; y++) { pix += i_stride; memcpy(pix, row, i_width * sizeof(pel_t)); } } } /* --------------------------------------------------------------------------- */ void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame) { int slice_start_y = 0; int slice_height = frame->i_lines[0]; int b_frame_start = 1; int b_frame_end = 1; int i; pel_t *pix; UNUSED_PARAMETER(h); for (i = 0; i < frame->i_plane; i++) { int chroma = !!i; int stride = frame->i_stride[i]; int width = frame->i_width[i]; int height = slice_height >> chroma; int pad_h = XAVS2_PAD >> chroma; int pad_v = XAVS2_PAD >> chroma; pix = frame->planes[i] + (slice_start_y >> chroma) * stride; plane_expand_border(pix, stride, width, height, pad_h, pad_v, b_frame_start, b_frame_end); } } /* --------------------------------------------------------------------------- */ void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lcu_y) { static const int UP_SHIFT = 4; int i_lcu_level = h->i_lcu_level; int b_start = !i_lcu_y; int b_end = (i_lcu_y == h->i_height_in_lcu - 1); int i; assert(h->param->slice_num == 1 || !h->param->b_cross_slice_loop_filter); for (i = 0; i < frame->i_plane; i++) { int chroma_shift = !!i; int stride = frame->i_stride[i]; int width = frame->i_width[i]; int padh = XAVS2_PAD >> chroma_shift; int padv = XAVS2_PAD >> 
chroma_shift; int y_start = ((i_lcu_y + 0) << (i_lcu_level - chroma_shift)); int y_end = ((i_lcu_y + 1) << (i_lcu_level - chroma_shift)); int height; pel_t *pix; if (i_lcu_y != h->slices[h->i_slice_index]->i_first_lcu_y) { y_start -= UP_SHIFT; } if (i_lcu_y != h->slices[h->i_slice_index]->i_last_lcu_y) { y_end -= UP_SHIFT; } y_end = XAVS2_MIN(frame->i_lines[i], y_end); height = y_end - y_start; // if (i == 0) { // xavs2_log(NULL, XAVS2_LOG_DEBUG, "Pad POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n", // h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end); // } pix = frame->planes[i] + y_start * stride; plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end); } } /* --------------------------------------------------------------------------- */ void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame) { int i, y; for (i = 0; i < frame->i_plane; i++) { int i_scale = !!i; int i_width = h->param->org_width >> i_scale; int i_height = h->param->org_height >> i_scale; int i_padx = (h->i_width - h->param->org_width ) >> i_scale; int i_pady = (h->i_height - h->param->org_height) >> i_scale; int i_stride = frame->i_stride[i]; /* expand right border */ if (i_padx) { pel_t *pix = frame->planes[i] + i_width; for (y = 0; y < i_height; y++) { memset(pix, pix[-1], i_padx); pix += i_stride; } } /* expand bottom border */ if (i_pady) { int rowlen = (i_width + i_padx) * sizeof(pel_t); pel_t *row = frame->planes[i] + (i_height - 1) * i_stride; pel_t *pix = frame->planes[i] + (i_height ) * i_stride; for (y = i_height; y < i_height + i_pady; y++) { memcpy(pix, row, rowlen); pix += i_stride; } } } } /* --------------------------------------------------------------------------- * FIXME: ҪpaddingĿ */ void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src) { int k; UNUSED_PARAMETER(h); if (dst->size_plane_buf == src->size_plane_buf && dst->i_width[0] == src->i_width[0]) { g_funcs.fast_memcpy(dst->plane_buf, src->plane_buf, src->size_plane_buf); } else { for (k = 0; k < dst->i_plane; k++) { g_funcs.plane_copy(dst->planes[k], dst->i_stride[k], src->planes[k], src->i_stride[k], src->i_width[k], src->i_lines[k]); } } } xavs2-1.3/source/common/frame.h000066400000000000000000000057301340660520300164570ustar00rootroot00000000000000/* * frame.h * * Description of this file: * Frame handling functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_FRAME_H #define XAVS2_FRAME_H /** * =========================================================================== * function declares * =========================================================================== */ #define xavs2_frame_new FPFX(frame_new) xavs2_frame_t *xavs2_frame_new(xavs2_t *h, uint8_t **mem_base, int alloc_type); #define xavs2_frame_delete FPFX(frame_delete) void xavs2_frame_delete(xavs2_handler_t *h_mgr, xavs2_frame_t *frame); #define xavs2_frame_buffer_size FPFX(frame_buffer_size) size_t xavs2_frame_buffer_size(const xavs2_param_t *param, int alloc_type); #define xavs2_frame_destroy_objects FPFX(frame_destroy_objects) void xavs2_frame_destroy_objects(xavs2_handler_t *h_mgr, xavs2_frame_t *frame); #define xavs2_frame_copy_planes FPFX(frame_copy_planes) void xavs2_frame_copy_planes(xavs2_t *h, xavs2_frame_t *dst, xavs2_frame_t *src); #define xavs2_frame_expand_border_frame FPFX(frame_expand_border_frame) void plane_expand_border(pel_t *p_pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom); void xavs2_frame_expand_border_frame(xavs2_t *h, xavs2_frame_t *frame); #define xavs2_frame_expand_border_lcurow FPFX(frame_expand_border_lcurow) void xavs2_frame_expand_border_lcurow(xavs2_t *h, xavs2_frame_t *frame, int i_lcu_y); #define xavs2_frame_expand_border_mod8 FPFX(frame_expand_border_mod8) void xavs2_frame_expand_border_mod8(xavs2_t *h, xavs2_frame_t *frame); #endif /* XAVS2_FRAME_H */ xavs2-1.3/source/common/intra.c000066400000000000000000002437261340660520300165060ustar00rootroot00000000000000/* * intra.c * * Description of this file: * Intra prediction functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "block_info.h" #include "cudata.h" #include "cpu.h" #if HAVE_MMX #include "vec/intrinsic.h" #endif // --------------------------------------------------------------------------- // disable warning #if defined(_MSC_VER) || defined(__ICL) #pragma warning(disable: 4100) // unreferenced formal parameter #endif /** * =========================================================================== * local tables * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const int16_t tab_log2size[MAX_CU_SIZE + 1] = { -1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; /* --------------------------------------------------------------------------- */ static const char tab_auc_dir_dx[NUM_INTRA_MODE] = { 0, 0, 0, 11, 2, 11, 1, 8, 1, 4, 1, 1, /* X */ 0, 1, 1, 4, 1, 8, 1, 11, 2, 11, 4, 8, /* XY */ 0, 8, 4, 11, 2, 11, 1, 8, 1 /* Y */ }; /* --------------------------------------------------------------------------- */ static const char tab_auc_dir_dy[NUM_INTRA_MODE] = { 0, 0, 0, -4, -1, -8, -1, -11, -2, -11, -4, -8, /* X */ 0, 8, 4, 11, 2, 11, 1, 8, 1, 4, 1, 1, /* XY */ 0, -1, -1, -4, -1, -8, -1, -11, -2 /* Y */ }; /* --------------------------------------------------------------------------- */ static const char tab_auc_dir_dxdy[2][NUM_INTRA_MODE][2] = { { // dx/dy { 0,0}, {0,0}, { 0,0}, {11,2}, {2,0}, {11,3}, {1,0}, {93,7}, {1,1}, {93,8}, {1,2}, { 1,3}, /* X */ { 0,0}, { 1,3}, {1,2}, {93,8}, {1,1}, {93,7}, {1,0}, {11,3}, {2,0}, {11,2}, {4,0}, {8,0}, /* XY */ { 0,0}, { 8,0}, {4,0}, {11,2}, {2,0}, {11,3}, {1,0}, {93,7}, {1,1}, /* Y */ }, { // dy/dx { 0,0}, {0,0}, { 0,0}, {93,8}, {1,1}, {93,7}, {1,0}, {11,3}, {2,0}, {11,2}, {4,0}, { 8,0}, /* X */ { 0,0}, { 8,0}, {4,0}, {11,2}, {2,0}, {11,3}, {1,0}, {93,7}, {1,1}, {93,8}, {1,2}, {1,3}, /* XY */ { 0,0}, { 1,3}, {1,2}, {93,8}, {1,1}, {93,7}, {1,0}, {11,3}, {2,0} /* Y */ } }; /** * =========================================================================== * local function definition * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void intra_pred_ver_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *p_src = src + 1; int y; for (y = 0; y < bsy; y++) { g_funcs.fast_memcpy(dst, p_src, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *p_src = src - 1; while (bsy-- != 0) { g_funcs.mem_repeat_p(dst, *p_src--, bsx); dst += i_dst; } } /* --------------------------------------------------------------------------- * NOTE: dir_mode = (bAboveAvail << 8) + (bLeftAvail) */ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int avail_top = dir_mode >> 8; int avail_left = dir_mode & 0xFF; int sum_left = 0; int sum_above = 0; int dc_value; int x, y; pel_t *p_src; p_src = src - 1; for (y = 0; y < bsy; y++) { sum_left += p_src[-y]; } p_src = src + 1; for (x = 0; x < bsx; x++) { sum_above += p_src[x]; } if (avail_left && avail_top) { x = bsx + bsy; dc_value = ((sum_left + 
sum_above + (x >> 1)) * (512 / x)) >> 9; } else if (avail_left) { dc_value = (sum_left + (bsy >> 1)) >> xavs2_log2u(bsy); } else if (avail_top) { dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx); } else { dc_value = g_dc_value; } for (y = 0; y < bsy; y++) { g_funcs.mem_repeat_p(dst, (pel_t)dc_value, bsx); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { /* size in bits: 2 3 4 5 6 */ static const int ib_mult [8] = { 0, 0, 13, 17, 5, 11, 23, 0 }; static const int ib_shift[8] = { 0, 0, 7, 10, 11, 15, 19, 0 }; const int mult_h = ib_mult [tab_log2size[bsx]]; const int mult_v = ib_mult [tab_log2size[bsy]]; const int shift_h = ib_shift[tab_log2size[bsx]]; const int shift_v = ib_shift[tab_log2size[bsy]]; const int W2 = bsx >> 1; /* half block width */ const int H2 = bsy >> 1; /* half block height */ const int vmax = (1 << g_bit_depth) - 1; /* max value of pixel */ int H = 0; int V = 0; int a, b, c; int x, y; pel_t *p_src; /* calculate H and V */ p_src = src + W2; for (x = 1; x < W2 + 1; x++) { H += x * (p_src[x] - p_src[-x]); } p_src = src - H2; for (y = 1; y < H2 + 1; y++) { V += y * (p_src[-y] - p_src[y]); } a = (src[-bsy] + src[bsx]) << 4; b = ((H << 5) * mult_h + (1 << (shift_h - 1))) >> shift_h; c = ((V << 5) * mult_v + (1 << (shift_v - 1))) >> shift_v; a += 16 - b * (W2 - 1) - c * (H2 - 1); for (y = 0; y < bsy; y++) { int pix = a; for (x = 0; x < bsx; x++) { dst[x] = (pel_t)XAVS2_CLIP3(0, vmax, pix >> 5); pix += b; } dst += i_dst; a += c; } } /* --------------------------------------------------------------------------- */ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { itr_t pTop[MAX_CU_SIZE], pLeft[MAX_CU_SIZE], pT[MAX_CU_SIZE], pL[MAX_CU_SIZE], wy[MAX_CU_SIZE]; int shift_x = tab_log2size[bsx]; int shift_y = tab_log2size[bsy]; int shift = XAVS2_MIN(shift_x, shift_y); int shift_xy = shift_x + shift_y + 1; int offset = 1 << (shift_x + shift_y); int vmax = max_pel_value; // max value of pixel int a, b, c, t, wxy, temp; int predx, val; int x, y; pel_t *p_src; p_src = src + 1; for (x = 0; x < bsx; x++) { pTop[x] = p_src[x]; } p_src = src - 1; for (y = 0; y < bsy; y++) { pLeft[y] = p_src[-y]; } a = pTop [bsx - 1]; b = pLeft[bsy - 1]; c = (bsx == bsy) ? 
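        /* square block: c is the plain average of the two far corner references;
         * non-square block: a width/height-weighted combination of a and b */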
(a + b + 1) >> 1 : (((a << shift_x) + (b << shift_y)) * 13 + (1 << (shift + 5))) >> (shift + 6); t = (c << 1) - a - b; for (x = 0; x < bsx; x++) { pT [x] = (itr_t)(b - pTop[x]); pTop[x] <<= shift_y; } temp = 0; for (y = 0; y < bsy; y++) { pL [y] = (itr_t)(a - pLeft[y]); pLeft[y] <<= shift_x; wy [y] = (itr_t)temp; temp += t; } for (y = 0; y < bsy; y++) { predx = pLeft[y]; wxy = -wy[y]; for (x = 0; x < bsx; x++) { predx += pL[y]; wxy += wy[y]; pTop[x] += pT[x]; val = ((predx << shift_y) + (pTop[x] << shift_x) + wxy + offset) >> shift_xy; dst[x] = (pel_t)XAVS2_CLIP3(0, vmax, val); } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int get_context_pixel(int dir_mode, int xy_flag, int temp_d, int *offset) { const int mult = tab_auc_dir_dxdy[xy_flag][dir_mode][0]; const int shift = tab_auc_dir_dxdy[xy_flag][dir_mode][1]; int temp_dn; temp_d *= mult; temp_dn = temp_d >> shift; *offset = ((temp_d << 5) >> shift) - (temp_dn << 5); return temp_dn; } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int f0, f1, f2, f3; int i, j; int iX; for (j = 0; j < bsy; j++) { iX = get_context_pixel(dir_mode, 0, j + 1, &f3); f0 = 32 - f3; f1 = 64 - f3; f2 = 32 + f3; for (i = 0; i < bsx; i++) { dst[i] = (pel_t)((src[iX ] * f0 + src[iX + 1] * f1 + src[iX + 2] * f2 + src[iX + 3] * f3 + 64) >> 7); iX++; } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int offsets[64]; int xsteps[64]; int offset; int i, j; int iY; for (i = 0; i < bsx; i++) { xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &offsets[i]); } for (j = 0; j < bsy; j++) { for (i = 0; i < bsx; i++) { iY = j + xsteps[i]; offset = offsets[i]; dst[i] = (pel_t)((src[-iY ] * (32 - offset) + src[-iY - 1] * (64 - offset) + src[-iY - 2] * (32 + offset) + src[-iY - 3] * ( offset) + 64) >> 7); } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(int xoffsets[64]); ALIGN16(int xsteps[64]); int i, j, iXx, iYy; int offsetx, offsety; for (i = 0; i < bsx; i++) { xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &xoffsets[i]); } for (j = 0; j < bsy; j++) { iXx = -get_context_pixel(dir_mode, 0, j + 1, &offsetx); for (i = 0; i < bsx; i++) { iYy = j - xsteps[i]; if (iYy <= -1) { dst[i] = (pel_t)((src[ iXx + 2] * (32 - offsetx) + src[ iXx + 1] * (64 - offsetx) + src[ iXx ] * (32 + offsetx) + src[ iXx - 1] * ( offsetx) + 64) >> 7); } else { offsety = xoffsets[i]; dst[i] = (pel_t)((src[-iYy - 2] * (32 - offsety) + src[-iYy - 1] * (64 - offsety) + src[-iYy ] * (32 + offsety) + src[-iYy + 1] * ( offsety) + 64) >> 7); } iXx++; } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[(64 + 176) << 2]); int line_size = bsx + (bsy >> 2) * 11 - 1; int aligned_line_size = 64 + 176; int i_dst4 = i_dst << 2; int i; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; for (i = 0; i < line_size; i++, 
src++) { pfirst[0][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); pfirst[1][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); pfirst[2][i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); pfirst[3][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 0 * src[14] + 2) >> 2); } bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst += i_dst4; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_4_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; src += 3; for (i = 0; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (((bsy > 4) && (bsx > 8))) { ALIGN16(pel_t first_line[(64 + 80) << 3]); int line_size = bsx + (((bsy - 8) * 11) >> 3); int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; for (i = 0; i < line_size; src++, i++) { pfirst[0][i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); pfirst[1][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); pfirst[2][i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); pfirst[3][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); pfirst[4][i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); pfirst[5][i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); pfirst[6][i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); pfirst[7][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t)); memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t 
*dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } } else if (bsx == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < 8; src++, i++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); dst5[i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); dst6[i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); dst7[i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); dst8[i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } if (bsy == 32) { //src -> 8,src[8] -> 16 pel_t pad1 = src[8]; dst1 = dst8 + i_dst; int j; for (j = 0; j < 24; j++) { for (i = 0; i < 8; i++) { dst1[i] = pad1; } dst1 += i_dst; } dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; src += 4; dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (i = 0; i < 4; i++, src++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } if (bsy == 16) { pel_t *dst5 = dst4 + i_dst; src += 4; pel_t pad1 = src[0]; int j; for (j = 0; j < 12; j++) { for (i = 0; i < 4; i++) { dst5[i] = pad1; } dst5 += i_dst; } dst5 = dst4 + i_dst; dst5[0] = (pel_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); dst5[1] = (pel_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_6_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; for (i = 0; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void 
intra_pred_ang_x_7_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; if (bsy == 4) { for (i = 0; i < bsx; src++, i++){ dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); } } else if (bsy == 8) { pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; src++, i++){ dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); dst5[i] = (pel_t)((src[3] * 3 + src[4] * 11 + src[5] * 13 + src[6] * 5 + 16) >> 5); dst6[i] = (pel_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7); dst7[i] = (pel_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32) >> 6); dst8[i] = (pel_t)((src[5] * 3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6); } } else { intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_8_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; pel_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; for (i = 0; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); pfirst[1][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); } bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsy > 8){ intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); /* ALIGN16(pel_t first_line[(64 + 32) * 11]); int line_size = bsx + (bsy * 93 >> 8) - 1; int real_size = XAVS2_MIN(line_size, bsx * 2); int aligned_line_size = ((line_size + 31) >> 5) << 5; int i_dst11 = i_dst * 11; int i; pel_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11; pel_t *pfirst[11]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; pfirst[8] = pfirst[7] + aligned_line_size; pfirst[9] = pfirst[8] + aligned_line_size; pfirst[10] = pfirst[9] + aligned_line_size; for (i = 0; i < real_size; i++, src++) { pfirst[0][i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); pfirst[1][i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); 
pfirst[2][i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6); pfirst[3][i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); pfirst[4][i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); pfirst[5][i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); pfirst[6][i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); pfirst[7][i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); pfirst[8][i] = (pel_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4); pfirst[9][i] = (pel_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5); pfirst[10][i] = (pel_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7); } // padding if (real_size < line_size) { pfirst[8][real_size - 3] = pfirst[8][real_size - 4]; pfirst[9][real_size - 3] = pfirst[9][real_size - 4]; pfirst[10][real_size - 3] = pfirst[10][real_size - 4]; pfirst[8][real_size - 2] = pfirst[8][real_size - 3]; pfirst[9][real_size - 2] = pfirst[9][real_size - 3]; pfirst[10][real_size - 2] = pfirst[10][real_size - 3]; pfirst[8][real_size - 1] = pfirst[8][real_size - 2]; pfirst[9][real_size - 1] = pfirst[9][real_size - 2]; pfirst[10][real_size - 1] = pfirst[10][real_size - 2]; pfirst[5][real_size - 2] = pfirst[5][real_size - 3]; pfirst[6][real_size - 2] = pfirst[6][real_size - 3]; pfirst[7][real_size - 2] = pfirst[7][real_size - 3]; pfirst[5][real_size - 1] = pfirst[5][real_size - 2]; pfirst[6][real_size - 1] = pfirst[6][real_size - 2]; pfirst[7][real_size - 1] = pfirst[7][real_size - 2]; pfirst[2][real_size - 1] = pfirst[2][real_size - 2]; pfirst[3][real_size - 1] = pfirst[3][real_size - 2]; pfirst[4][real_size - 1] = pfirst[4][real_size - 2]; pad1 = pfirst[0][real_size - 1]; pad2 = pfirst[1][real_size - 1]; pad3 = pfirst[2][real_size - 1]; pad4 = pfirst[3][real_size - 1]; pad5 = pfirst[4][real_size - 1]; pad6 = pfirst[5][real_size - 1]; pad7 = pfirst[6][real_size - 1]; pad8 = pfirst[7][real_size - 1]; pad9 = pfirst[8][real_size - 1]; pad10 = pfirst[9][real_size - 1]; pad11 = pfirst[10][real_size - 1]; for (; i < line_size; i++) { pfirst[0][i] = pad1; pfirst[1][i] = pad2; pfirst[2][i] = pad3; pfirst[3][i] = pad4; pfirst[4][i] = pad5; pfirst[5][i] = pad6; pfirst[6][i] = pad7; pfirst[7][i] = pad8; pfirst[8][i] = pad9; pfirst[9][i] = pad10; pfirst[10][i] = pad11; } } int bsy_b = bsy / 11; for (i = 0; i < bsy_b; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel_t)); memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel_t)); memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel_t)); dst += i_dst11; } int bsy_r = bsy - bsy_b * 11; for (i = 0; i < bsy_r; i++) { memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel_t)); dst += i_dst; } */ } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (int i = 0; 
i < bsx; i++, src++) { dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); dst5[i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); dst6[i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); dst7[i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); dst8[i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); } } else /*if (bsy == 4)*/ { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (int i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; int i; if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 16)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); pfirst[1][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); pfirst[2][i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); pfirst[3][i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } bsy >>= 2; i_dst <<= 2; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel_t)); dst1 += i_dst; dst2 += i_dst; dst3 += i_dst; dst4 += i_dst; } } else { for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); dst2[i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); dst3[i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); dst4[i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { ALIGN16(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst8 = i_dst << 3; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; for (i = 
0; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); pfirst[1][i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); pfirst[2][i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); pfirst[3][i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); pfirst[4][i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); pfirst[5][i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); pfirst[6][i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); pfirst[7][i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + 0 * src[4] + 2) >> 2); } bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); dst += i_dst8; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); dst2[i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); dst3[i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); dst5[i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); dst6[i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); dst7[i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); dst8[i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + 2) >> 2); } } else { for (i = 0; i < bsx; i++, src++) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; dst1[i] = (pel_t)(( 7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); dst2[i] = (pel_t)(( 3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); dst3[i] = (pel_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { ALIGN16(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; for (i = 0; i < line_size; i += 8, src--) { first_line[0 + i] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); first_line[1 + i] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); first_line[2 + i] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); first_line[3 + i] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); first_line[4 + i] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); first_line[5 + i] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); first_line[6 + i] = 
(pel_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); first_line[7 + i] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); } for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); dst[4] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); dst[5] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); dst[6] = (pel_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); dst[7] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { ALIGN16(pel_t first_line[64 + 256]); int line_size = bsx + ((bsy - 1) << 2); int iHeight4 = bsy << 2; for (i = 0; i < line_size; i += 4, src--) { first_line[i ] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); first_line[i + 1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); first_line[i + 2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); first_line[i + 3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); } for (i = 0; i < iHeight4; i += 4) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); dst[1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); dst[2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); dst[3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8){ intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx == 8){ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); dst[4] = (pel_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6); dst[5] = (pel_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] + 5 * src[-5] + 64) >> 7); dst[6] = (pel_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7); dst[7] = (pel_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 
29 * src[-5] + 64) >> 7); dst += i_dst; } } else{ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_28_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; for (i = 0; i < line_size; i += 2, src--) { first_line[i ] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); first_line[i + 1] = (pel_t)((src[-1] + (src[-2] << 1) + src[-3] + 2) >> 2); } for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_29_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); dst[4] = (pel_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5); dst[5] = (pel_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 11 + 64) >> 7); dst[6] = (pel_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6); dst[7] = (pel_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_30_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; src -= 2; for (i = 0; i < line_size; i++, src--) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); ALIGN16(pel_t src_tran[MAX_CU_SIZE << 3]); int i; if (bsx >= bsy){ // transposition // i < (bsx * 19 / 8 + 3) for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++){ src_tran[i] = src[-i]; } intra_pred_ang_x_5_c(src_tran, dst_tran, bsy, 5, bsy, bsx); for (i = 0; i < bsy; i++){ for (int j = 0; j < bsx; j++){ dst[j + i_dst * i] = 
dst_tran[i + bsy * j]; } } } else if (bsx == 8){ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst[4] = (pel_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5); dst[5] = (pel_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4); dst[6] = (pel_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5); dst[7] = (pel_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_32_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (32 + 64)]); int line_size = (bsy >> 1) + bsx - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; pel_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 3; for (i = 0; i < line_size; i++, src -= 2) { pfirst[0][i] = (pel_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2); pfirst[1][i] = (pel_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2); } bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { ALIGN16(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); pfirst[1][i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); pfirst[2][i] = (pel_t)((5 * 
src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); pfirst[3][i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); pfirst[4][i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); pfirst[5][i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); pfirst[6][i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); pfirst[7][i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); dst4[i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); dst5[i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); dst6[i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); dst7[i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); dst8[i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); } } else { for (i = 0; i < bsx; i++, src++) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); dst4[i] = (pel_t)(( src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); } } } static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 16)]); int line_size = bsx + (bsy >> 2) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; for (i = 0; i < left_size; i++, src += 4) { pfirst[0][i] = (pel_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[1][i] = (pel_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[2][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[3][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); pfirst[1][i] = (pel_t)((src[-1] + (src[0] + 
src[1]) * 3 + src[2] + 4) >> 3); pfirst[2][i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); pfirst[3][i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); dst2[i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); dst3[i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); dst4[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; pel_t *pfirst[2]; int i; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; for (i = 0; i < left_size; i++, src += 2) { pfirst[0][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; src -= bsy - 1; for (i = 0; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst--; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int left_size = ((bsy - 1) << 1) + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; src -= bsy; for (i = 0; i < left_size; i += 2, src++) { first_line[i ] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); first_line[i + 1] = (pel_t)(( src[0] + (src[1] << 1) + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int 
i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { src -= bsy; ALIGN16(pel_t first_line[64 + 256]); int left_size = ((bsy - 1) << 2) + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; for (i = 0; i < left_size; i += 4, src++) { first_line[i ] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); first_line[i + 1] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); first_line[i + 2] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); first_line[i + 3] = (pel_t)(( src[0] + src[1] * 2 + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); dst[1] = (pel_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); dst[2] = (pel_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); dst[3] = (pel_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); dst += i_dst; } // needn't pad, (3,0) is equal for ang_x and ang_y } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { ALIGN16(pel_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 7; src -= bsy; for (i = 0; i < left_size; i += 8, src++) { first_line[i ] = (pel_t)((7 * src[-1] + 15 * src[0] + 9 * src[1] + src[2] + 16) >> 5); first_line[i + 1] = (pel_t)((3 * src[-1] + 7 * src[0] + 5 * src[1] + src[2] + 8) >> 4); first_line[i + 2] = (pel_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5); first_line[i + 3] = (pel_t)(( src[-1] + 3 * src[0] + 3 * src[1] + src[2] + 4) >> 3); first_line[i + 4] = (pel_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5); first_line[i + 5] = (pel_t)(( src[-1] + 5 * src[0] + 7 * src[1] + 3 * src[2] + 8) >> 4); first_line[i + 6] = (pel_t)(( src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); first_line[i + 7] = (pel_t)(( src[ 0] + 2 * src[1] + src[2] + 0 * src[3] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 8; } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); dst[1] = (pel_t)((3 * src[-2] + 7 * src[-1] + 5 * src[0] + src[1] + 8) >> 4); dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); dst[3] = (pel_t)(( src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); dst[4] = (pel_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5); dst[5] = (pel_t)(( src[-2] + 5 * src[-1] + 7 * src[0] + 3 * src[1] + 8) >> 4); dst[6] = (pel_t)(( src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); dst[7] = (pel_t)(( src[-1] + 2 * src[ 0] + src[1] + 0 * src[2] + 2) >> 2); dst += i_dst; } // needn't pad, (7,0) is equal for ang_x and ang_y } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] + 9 * src[0] + src[1] + 16) >> 5); dst[1] = (pel_t)((3 * src[-2] + 7 * src[-1] + 5 * 
src[0] + src[1] + 8) >> 4); dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5); dst[3] = (pel_t)((src[-2] + 3 * src[-1] + 3 * src[0] + src[1] + 4) >> 3); dst += i_dst; } } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * variant for PUs whose top and left neighbors are both taken from the LCU edge-pixel buffer (pLcuEP) */ static void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { int num_padding = 0; /* fill default value */ g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } /* fill top-left pixel */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pLcuEP[1]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pLcuEP[-1]; } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * variant for PUs whose left neighbors are read column-wise from the reconstructed picture (pTL + i_TL) while the top neighbors come from the LCU edge-pixel buffer */ static void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { const pel_t *pL = pTL + i_TL; int num_padding = 0; /* fill default value */ g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ g_funcs.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { g_funcs.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { int y; const pel_t *p_l = pL + bsy * i_TL; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } /* fill top-left pixel */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pLcuEP[1]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * variant for PUs whose top neighbors are read from the reconstructed picture (row at pTL + 1) while the left neighbors come from the LCU edge-pixel buffer */ static void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { const pel_t *pT = pTL + 1; int num_padding = 0; /* fill default value */ g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } /* fill top-left pixel */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pLcuEP[-1]; } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * variant for PUs whose top and left neighbors are both read from the reconstructed picture around pTL */ static void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { const pel_t *pT = pTL + 1; const pel_t *pL = pTL + i_TL; int num_padding = 0; /* fill default value */ g_funcs.mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1); /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ g_funcs.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { g_funcs.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else { g_funcs.mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { int y; const pel_t *p_l = pL + bsy * i_TL; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { g_funcs.mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } /* fill top-left pixel */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pTL[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { g_funcs.mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } } /** * =========================================================================== * interface function definition * =========================================================================== */ /* --------------------------------------------------------------------------- */ void xavs2_intra_pred_init(uint32_t cpuid, intrinsic_func_t *pf) { #define ANG_X_OFFSET 3 #define ANG_XY_OFFSET 13 #define ANG_Y_OFFSET 25 int i; intra_pred_t *ipred = pf->intraf; pf->fill_edge_f[0] = fill_reference_samples_0_c; pf->fill_edge_f[1] = fill_reference_samples_x_c; pf->fill_edge_f[2] = fill_reference_samples_y_c; pf->fill_edge_f[3] = fill_reference_samples_xy_c; ipred[DC_PRED ] = intra_pred_dc_c; // 0 ipred[PLANE_PRED] = intra_pred_plane_c; // 1 ipred[BI_PRED ] = intra_pred_bilinear_c; // 2 for (i = ANG_X_OFFSET; i < VERT_PRED; i++) { ipred[i ] = intra_pred_ang_x_c; // 3 ~ 11 } ipred[VERT_PRED ] = intra_pred_ver_c; // 12 for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) { ipred[i ] = intra_pred_ang_xy_c; // 13 ~ 23 } ipred[HOR_PRED ] = intra_pred_hor_c; // 24 for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) { ipred[i ] = intra_pred_ang_y_c; // 25 ~ 32 } ipred[INTRA_ANG_X_3 ] = intra_pred_ang_x_3_c; ipred[INTRA_ANG_X_4 ] = intra_pred_ang_x_4_c; ipred[INTRA_ANG_X_5 ] = intra_pred_ang_x_5_c; ipred[INTRA_ANG_X_6 ] = intra_pred_ang_x_6_c; ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_c; ipred[INTRA_ANG_X_8 ] = intra_pred_ang_x_8_c; ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_c; ipred[INTRA_ANG_X_10] = intra_pred_ang_x_10_c; ipred[INTRA_ANG_X_11] = intra_pred_ang_x_11_c; ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_c; ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_c; ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_c; ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_c; ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_c; ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_c; ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_c; ipred[INTRA_ANG_Y_25] = intra_pred_ang_y_25_c; ipred[INTRA_ANG_Y_26] = intra_pred_ang_y_26_c; ipred[INTRA_ANG_Y_27] = intra_pred_ang_y_27_c; 
ipred[INTRA_ANG_Y_28] = intra_pred_ang_y_28_c; ipred[INTRA_ANG_Y_29] = intra_pred_ang_y_29_c; ipred[INTRA_ANG_Y_30] = intra_pred_ang_y_30_c; ipred[INTRA_ANG_Y_31] = intra_pred_ang_y_31_c; ipred[INTRA_ANG_Y_32] = intra_pred_ang_y_32_c; // TODO: verify that the 8-bit assembly for angular modes 7/9/11 is consistent with the C reference (20170716) #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE42) { ipred[DC_PRED ] = intra_pred_dc_sse128; ipred[HOR_PRED ] = intra_pred_hor_sse128; ipred[VERT_PRED ] = intra_pred_ver_sse128; ipred[PLANE_PRED ] = intra_pred_plane_sse128; ipred[BI_PRED ] = intra_pred_bilinear_sse128; ipred[INTRA_ANG_X_3 ] = intra_pred_ang_x_3_sse128; ipred[INTRA_ANG_X_4 ] = intra_pred_ang_x_4_sse128; ipred[INTRA_ANG_X_5 ] = intra_pred_ang_x_5_sse128; ipred[INTRA_ANG_X_6 ] = intra_pred_ang_x_6_sse128; ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_sse128; ipred[INTRA_ANG_X_8 ] = intra_pred_ang_x_8_sse128; ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_sse128; ipred[INTRA_ANG_X_10 ] = intra_pred_ang_x_10_sse128; ipred[INTRA_ANG_X_11 ] = intra_pred_ang_x_11_sse128; ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_sse128; ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_sse128; ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_sse128; ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_sse128; ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_sse128; ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_sse128; ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_sse128; ipred[INTRA_ANG_Y_25 ] = intra_pred_ang_y_25_sse128; ipred[INTRA_ANG_Y_26 ] = intra_pred_ang_y_26_sse128; ipred[INTRA_ANG_Y_28 ] = intra_pred_ang_y_28_sse128; ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_sse128; ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_sse128; ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_sse128; pf->fill_edge_f[0] = fill_edge_samples_0_sse128; pf->fill_edge_f[1] = fill_edge_samples_x_sse128; pf->fill_edge_f[2] = fill_edge_samples_y_sse128; pf->fill_edge_f[3] = fill_edge_samples_xy_sse128; } /* 8/10-bit assembly */ if (cpuid & XAVS2_CPU_AVX2) { ipred[DC_PRED ] = intra_pred_dc_avx; ipred[HOR_PRED ] = intra_pred_hor_avx; ipred[VERT_PRED ] = intra_pred_ver_avx; ipred[PLANE_PRED ] = intra_pred_plane_avx; ipred[BI_PRED ] = intra_pred_bilinear_avx; ipred[INTRA_ANG_X_3 ] = intra_pred_ang_x_3_avx; ipred[INTRA_ANG_X_4 ] = intra_pred_ang_x_4_avx; ipred[INTRA_ANG_X_5 ] = intra_pred_ang_x_5_avx; ipred[INTRA_ANG_X_6 ] = intra_pred_ang_x_6_avx; ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_avx; ipred[INTRA_ANG_X_8 ] = intra_pred_ang_x_8_avx; ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_avx; ipred[INTRA_ANG_X_10 ] = intra_pred_ang_x_10_avx; ipred[INTRA_ANG_X_11 ] = intra_pred_ang_x_11_avx; ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_avx; ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_avx; ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_avx; ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_avx; ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_avx; ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_avx; ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_avx; ipred[INTRA_ANG_Y_25 ] = intra_pred_ang_y_25_avx; ipred[INTRA_ANG_Y_26 ] = intra_pred_ang_y_26_avx; ipred[INTRA_ANG_Y_28 ] = intra_pred_ang_y_28_avx; ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_avx; ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_avx; ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_avx; } #endif //if HAVE_MMX #undef ANG_X_OFFSET #undef ANG_XY_OFFSET #undef ANG_Y_OFFSET } xavs2-1.3/source/common/intra.h000066400000000000000000000073331340660520300165030ustar00rootroot00000000000000/* * intra.h * * Description of this file: * Intra prediction functions definition of the xavs2 library * * 
-------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_INTRA_H #define XAVS2_INTRA_H typedef struct intra_candidate_t intra_candidate_t; #define xavs2_intra_get_cu_neighbors FPFX(intra_get_cu_neighbors) uint32_t xavs2_intra_get_cu_neighbors(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int cu_size); #define xavs2_intra_fill_ref_samples_luma FPFX(intra_fill_ref_samples_luma) void xavs2_intra_fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy); #define rdo_get_pred_intra_luma FPFX(rdo_get_pred_intra_luma) int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); #define rdo_get_pred_intra_luma_rmd FPFX(rdo_get_pred_intra_luma_rmd) int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); #define rdo_get_pred_intra_luma_cuda FPFX(rdo_get_pred_intra_luma_cuda) int rdo_get_pred_intra_luma_cuda(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); #define rdo_get_pred_intra_luma_2nd_pass FPFX(rdo_get_pred_intra_luma_2nd_pass) int rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h); #define rdo_get_pred_intra_chroma FPFX(rdo_get_pred_intra_chroma) int rdo_get_pred_intra_chroma(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list); #define rdo_get_pred_intra_chroma_fast FPFX(rdo_get_pred_intra_chroma_fast) int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list); #endif // XAVS2_INTRA_H xavs2-1.3/source/common/mc.c000066400000000000000000001041231340660520300157530ustar00rootroot00000000000000/* * mc.c * * Description of this file: * MC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "predict.h" #include "wrapper.h" #include "frame.h" #include "cpu.h" #include "mc.h" #if HAVE_MMX #include "x86/mc.h" #include "vec/intrinsic.h" #endif /** * =========================================================================== * global/local variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ const int16_t tab_dmh_pos[DMH_MODE_NUM + DMH_MODE_NUM - 1][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, -1 }, { 1, 1 }, { 2, 0 }, { 0, 2 }, { 2, -2 }, { 2, 2 } }; /* --------------------------------------------------------------------------- * interpolate filter (luma) */ ALIGN16(static const int8_t INTPL_FILTERS[4][8]) = { { 0, 0, 0, 64, 0, 0, 0, 0 }, /* for full-pixel, no use */ { -1, 4, -10, 57, 19, -7, 3, -1 }, { -1, 4, -11, 40, 40, -11, 4, -1 }, { -1, 3, -7, 19, 57, -10, 4, -1 } }; /* --------------------------------------------------------------------------- * interpolate filter (chroma) */ ALIGN16(static const int8_t INTPL_FILTERS_C[8][4]) = { { 0, 64, 0, 0 }, /* for full-pixel, no use */ { -4, 62, 6, 0 }, { -6, 56, 15, -1 }, { -5, 47, 25, -3 }, { -4, 36, 36, -4 }, { -3, 25, 47, -5 }, { -1, 15, 56, -6 }, { 0, 6, 62, -4 } }; /* --------------------------------------------------------------------------- * interpolate offsets */ static const int MC_OFFSET = 8; static const int PAD_OFFSET = 4; /* --------------------------------------------------------------------------- * luma interpolating position */ enum intpl_pos_e { INTPL_POS_0 = 0, /* decoded luma full pel plane */ INTPL_POS_A = 1, /* interpolating position: a */ INTPL_POS_B = 2, /* */ INTPL_POS_C = 3, /* | */ INTPL_POS_D = 4, /* | 0 1 2 3 */ INTPL_POS_E = 5, /* ----+------------------ */ INTPL_POS_F = 6, /* | */ INTPL_POS_G = 7, /* 0 | * a b c */ INTPL_POS_H = 8, /* | */ INTPL_POS_I = 9, /* 1 | d e f g */ INTPL_POS_J = 10, /* | */ INTPL_POS_K = 11, /* 2 | h i j k */ INTPL_POS_N = 12, /* | */ INTPL_POS_P = 13, /* 3 | n p q r */ INTPL_POS_Q = 14, /* | */ INTPL_POS_R = 15 /* */ }; /** * =========================================================================== * macros * =========================================================================== */ #define FLT_8TAP_HOR(src, i, coef) ( \ (src)[i - 3] * (coef)[0] + \ (src)[i - 2] * (coef)[1] + \ (src)[i - 1] * (coef)[2] + \ (src)[i ] * (coef)[3] + \ (src)[i + 1] * (coef)[4] + \ (src)[i + 2] * (coef)[5] + \ (src)[i + 3] * (coef)[6] + \ (src)[i + 4] * (coef)[7]) #define FLT_8TAP_VER(src, i, 
i_src, coef) ( \ (src)[i - 3 * i_src] * (coef)[0] + \ (src)[i - 2 * i_src] * (coef)[1] + \ (src)[i - 1 * i_src] * (coef)[2] + \ (src)[i ] * (coef)[3] + \ (src)[i + 1 * i_src] * (coef)[4] + \ (src)[i + 2 * i_src] * (coef)[5] + \ (src)[i + 3 * i_src] * (coef)[6] + \ (src)[i + 4 * i_src] * (coef)[7]) #define FLT_4TAP_HOR(src, i, coef) ( \ (src)[i - 1] * (coef)[0] + \ (src)[i ] * (coef)[1] + \ (src)[i + 1] * (coef)[2] + \ (src)[i + 2] * (coef)[3]) #define FLT_4TAP_VER(src, i, i_src, coef) ( \ (src)[i - 1 * i_src] * (coef)[0] + \ (src)[i ] * (coef)[1] + \ (src)[i + 1 * i_src] * (coef)[2] + \ (src)[i + 2 * i_src] * (coef)[3]) /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void mc_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h) { while (h--) { memcpy(dst, src, w * sizeof(pel_t)); dst += i_dst; src += i_src; } } /* --------------------------------------------------------------------------- * plane copy */ static void plane_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h) { while (h--) { memcpy(dst, src, w * sizeof(pel_t)); dst += i_dst; src += i_src; } } #define PLANE_COPY(align, cpu) \ void plane_copy_##cpu(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)\ {\ int c_w = (align) / sizeof(pel_t) - 1;\ if (w < 256) { /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ plane_copy_c( dst, i_dst, src, i_src, w, h );\ } else if (!(w & c_w)) {\ xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ } else {\ if (--h > 0) {\ if( i_src > 0 ) {\ xavs2_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ dst += i_dst * h;\ src += i_src * h;\ } else {\ xavs2_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ }\ }\ /* use plain memcpy on the last line (in memory order) to avoid overreading src. 
*/\ memcpy( dst, src, w*sizeof(pel_t) );\ }\ } #if HAVE_MMX PLANE_COPY(16, mmx2) #endif /* --------------------------------------------------------------------------- * deinterleave copy, for chroma planes */ static void plane_copy_deinterleave_c(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h) { int x, y; for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) { for (x = 0; x < w; x++) { dstu[x] = src[2*x ]; dstv[x] = src[2*x + 1]; } } } /* --------------------------------------------------------------------------- */ void *memzero_aligned_c(void *dst, size_t n) { return memset(dst, 0, n); } /* --------------------------------------------------------------------------- */ void mem_repeat_i_c(void *dst, int val, size_t count) { int *p = (int *)dst; for (; count != 0; count--) { *p++ = val; } } /* --------------------------------------------------------------------------- */ void mem_repeat_8i_c(void *dst, int val, size_t count) { int64_t *p = (int64_t *)dst; int64_t val64 = val; val64 = (val64 << 32) | val; count = (count + 7) >> 3; for (; count != 0; count--) { *p++ = val64; *p++ = val64; *p++ = val64; *p++ = val64; } } /* --------------------------------------------------------------------------- */ static void intpl_chroma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6; dst[x] = (pel_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_chroma_block_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6; dst[x] = (pel_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) { ALIGN16(int32_t tmp_res[(32 + 3) * 32]); int32_t *tmp = tmp_res; const int shift1 = g_bit_depth - 8; const int add1 = (1 << shift1) >> 1; const int shift2 = 20 - g_bit_depth; const int add2 = 1 << (shift2 - 1); // 1<<(19-g_bit_depth) int x, y, v; src -= i_src; for (y = -1; y < height + 2; y++) { for (x = 0; x < width; x++) { v = FLT_4TAP_HOR(src, x, coeff_h); tmp[x] = (v + add1) >> shift1; } src += i_src; tmp += 32; } tmp = tmp_res + 32; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2; dst[x] = (pel_t)XAVS2_CLIP1(v); } dst += i_dst; tmp += 32; } } /* --------------------------------------------------------------------------- */ static void intpl_luma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6; dst[x] = (pel_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ #define intpl_luma_block_ver_c intpl_luma_ver_c /* --------------------------------------------------------------------------- */ static void intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, 
int width, int height, const int8_t *coeff_h, const int8_t *coeff_v) { #define TMP_STRIDE 64 const int shift1 = g_bit_depth - 8; const int add1 = (1 << shift1) >> 1; const int shift2 = 20 - g_bit_depth; const int add2 = 1 << (shift2 - 1);//1<<(19-bit_depth) ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]); mct_t *tmp = tmp_buf; int x, y, v; src -= 3 * i_src; for (y = -3; y < height + 4; y++) { for (x = 0; x < width; x++) { v = FLT_8TAP_HOR(src, x, coeff_h); tmp[x] = (mct_t)((v + add1) >> shift1); } src += i_src; tmp += TMP_STRIDE; } tmp = tmp_buf + 3 * TMP_STRIDE; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2; dst[x] = (pel_t)XAVS2_CLIP1(v); } dst += i_dst; tmp += TMP_STRIDE; } #undef TMP_STRIDE } /* --------------------------------------------------------------------------- */ static void intpl_luma_hor_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int x, y, v; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = FLT_8TAP_HOR(src, x, coeff); tmp[x] = (mct_t)v; dst[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_luma_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int x, y; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { int v = FLT_8TAP_VER(src, x, i_src, coeff); v = (v + 32) >> 6; dst[x] = (pel_t)XAVS2_CLIP1(v); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_luma_ver_x3_c(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff) { int x, y, v; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { v = FLT_8TAP_VER(src, x, i_src, coeff[0]); dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_VER(src, x, i_src, coeff[1]); dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_VER(src, x, i_src, coeff[2]); dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_luma_hor_x3_c(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int x, y, v; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; for (y = 0; y < height; y++) { for(x = 0; x < width; x++) { v = FLT_8TAP_HOR(src, x, coeff[0]); tmp0[x] = (mct_t)v; dst0[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_HOR(src, x, coeff[1]); tmp1[x] = (mct_t)v; dst1[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); v = FLT_8TAP_HOR(src, x, coeff[2]); tmp2[x] = (mct_t)v; dst2[x] = (pel_t)XAVS2_CLIP1((v + 32) >> 6); } src += i_src; tmp0 += i_tmp; tmp1 += i_tmp; tmp2 += i_tmp; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* --------------------------------------------------------------------------- */ static void intpl_luma_ext_c(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { const int MC_SHIFT = 20 - g_bit_depth; const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-g_bit_depth)) int x, y; for (y = 0; y < height; y++) { for (x = 0; x < width; 
x++) { int v = FLT_8TAP_VER(tmp, x, i_tmp, coeff); v = (v + MC_ADD) >> MC_SHIFT; dst[x] = (pel_t)XAVS2_CLIP1(v); } dst += i_dst; tmp += i_tmp; } } static void intpl_luma_ext_x3_c(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { const int MC_SHIFT = 20 - g_bit_depth; const int MC_ADD = 1 << (MC_SHIFT - 1); // (1 << (19-g_bit_depth)) int x, y; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { int v; v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[0]); v = (v + MC_ADD) >> MC_SHIFT; dst0[x] = (pel_t)XAVS2_CLIP1(v); v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[1]); v = (v + MC_ADD) >> MC_SHIFT; dst1[x] = (pel_t)XAVS2_CLIP1(v); v = FLT_8TAP_VER(tmp, x, i_tmp, coeff[2]); v = (v + MC_ADD) >> MC_SHIFT; dst2[x] = (pel_t)XAVS2_CLIP1(v); } dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; tmp += i_tmp; } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * predict one component of a luma block * ref_idx - reference frame (0.. / -1:backward) */ void mc_luma(pel_t *p_pred, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm) { int x = (pix_quad_x >> 2); int y = (pix_quad_y >> 2); int dx = pix_quad_x & 3; int dy = pix_quad_y & 3; int i_src = p_ref_frm->i_stride[0]; pel_t *src = p_ref_frm->filtered[(dy << 2) + dx]; /* fetch prediction result */ #if ENABLE_FRAME_SUBPEL_INTPL if (src != NULL) { src += y * i_src + x; g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); } else { #endif src = p_ref_frm->filtered[0] + y * i_src + x; if (dx == 0 && dy == 0) { g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred, i_pred, src, i_src); } else if (dy == 0) { g_funcs.intpl_luma_block_hor(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx]); } else if (dx == 0) { g_funcs.intpl_luma_block_ver(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dy]); } else { g_funcs.intpl_luma_block_ext(p_pred, i_pred, src, i_src, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]); } #if ENABLE_FRAME_SUBPEL_INTPL } #endif } /* --------------------------------------------------------------------------- */ void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int height, int b_start, int b_end) { int stride = frm->i_stride[IMG_Y]; // for src and dst int i_tmp = frm->i_width[IMG_Y] + 2 * XAVS2_PAD; int width = frm->i_width[IMG_Y] + 2 * PAD_OFFSET; int off_dst = start_y * stride - PAD_OFFSET; pel_t *src = frm->planes[IMG_Y] + off_dst; // reconstructed luma plane pel_t *p_dst[3]; const int8_t *p_coeffs[3]; pel_t *dst; mct_t *intpl_tmp[3]; /* ------------------------------------------------------------- * init */ intpl_tmp[0] = h->img4Y_tmp[0] + (XAVS2_PAD + start_y) * i_tmp + XAVS2_PAD - PAD_OFFSET; intpl_tmp[1] = h->img4Y_tmp[1] + (XAVS2_PAD + start_y) * i_tmp + XAVS2_PAD - PAD_OFFSET; intpl_tmp[2] = h->img4Y_tmp[2] + (XAVS2_PAD + start_y) * i_tmp + XAVS2_PAD - PAD_OFFSET; /* ------------------------------------------------------------- * interpolate horizontal positions: a,b,c; * 4 more rows needed for vertical interpolation */ // SSE x3 Optimization /* decoded luma full pel plane */ /* interpolating position: a */ /* */ /* | */ /* | 0 1 2 3 */ /* ----+------------------ */ /* | */ /* 0 | * a b c */ /* | */ 
/* 1 | d e f g */ /* | */ /* 2 | h i j k */ /* | */ /* 3 | n p q r */ /* | */ /* */ /* ------------------------------------------------------------- * interpolate horizontal positions: a.b,c */ { const int shift_h = 4; // ƫ4²ֵԲ intpl_tmp[0] -= shift_h * i_tmp; intpl_tmp[1] -= shift_h * i_tmp; intpl_tmp[2] -= shift_h * i_tmp; src -= shift_h * stride; if (h->use_fractional_me > 1) { p_dst[0] = frm->filtered[INTPL_POS_A] + off_dst - shift_h * stride; // a p_coeffs[0] = INTPL_FILTERS[INTPL_POS_A]; // a p_dst[1] = frm->filtered[INTPL_POS_B] + off_dst - shift_h * stride; // b p_coeffs[1] = INTPL_FILTERS[INTPL_POS_B]; // b p_dst[2] = frm->filtered[INTPL_POS_C] + off_dst - shift_h * stride; // c p_coeffs[2] = INTPL_FILTERS[INTPL_POS_C]; // c g_funcs.intpl_luma_hor_x3(p_dst, stride, intpl_tmp, i_tmp, src, stride, width, height + 4 + shift_h, p_coeffs); } else { // b dst = frm->filtered[INTPL_POS_B] + off_dst - 4 * stride; g_funcs.intpl_luma_hor(dst, stride, intpl_tmp[1], i_tmp, src, stride, width, height + 4 + shift_h, INTPL_FILTERS[INTPL_POS_B]); } src += shift_h * stride; intpl_tmp[0] += shift_h * i_tmp; intpl_tmp[1] += shift_h * i_tmp; intpl_tmp[2] += shift_h * i_tmp; } /* ------------------------------------------------------------- * interpolate vertical positions: d,h,n */ if (h->use_fractional_me > 1) { p_dst[0] = frm->filtered[INTPL_POS_D] + off_dst; // d p_coeffs[0] = INTPL_FILTERS[INTPL_POS_D >> 2]; // d p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst; // h p_coeffs[1] = INTPL_FILTERS[INTPL_POS_H >> 2]; // h p_dst[2] = frm->filtered[INTPL_POS_N] + off_dst; // n p_coeffs[2] = INTPL_FILTERS[INTPL_POS_N >> 2]; // n g_funcs.intpl_luma_ver_x3(p_dst, stride, src, stride, width, height, p_coeffs); } else{ p_dst[1] = frm->filtered[INTPL_POS_H] + off_dst; // h g_funcs.intpl_luma_ver(p_dst[1], stride, src, stride, width, height, INTPL_FILTERS[INTPL_POS_H >> 2]); } /* ------------------------------------------------------------- * interpolate tilt positions: [e,f,g; i,j,k; p,q,r] */ if (h->use_fractional_me > 1) { // --- for e,i,p --- p_dst[0] = frm->filtered[INTPL_POS_E] + off_dst; // e p_coeffs[0] = INTPL_FILTERS[INTPL_POS_E >> 2]; // e p_dst[1] = frm->filtered[INTPL_POS_I] + off_dst; // i p_coeffs[1] = INTPL_FILTERS[INTPL_POS_I >> 2]; // i p_dst[2] = frm->filtered[INTPL_POS_P] + off_dst; // p p_coeffs[2] = INTPL_FILTERS[INTPL_POS_P >> 2]; // p g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[0], i_tmp, width, height, p_coeffs); // --- for f,j,q --- p_dst[0] = frm->filtered[INTPL_POS_F] + off_dst; // f p_coeffs[0] = INTPL_FILTERS[INTPL_POS_F >> 2]; // f p_dst[1] = frm->filtered[INTPL_POS_J] + off_dst; // j p_coeffs[1] = INTPL_FILTERS[INTPL_POS_J >> 2]; // j p_dst[2] = frm->filtered[INTPL_POS_Q] + off_dst; // q p_coeffs[2] = INTPL_FILTERS[INTPL_POS_Q >> 2]; // q g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[1], i_tmp, width, height, p_coeffs); // --- for g,k,r --- p_dst[0] = frm->filtered[INTPL_POS_G] + off_dst; // g p_coeffs[0] = INTPL_FILTERS[INTPL_POS_G >> 2]; // g p_dst[1] = frm->filtered[INTPL_POS_K] + off_dst; // k p_coeffs[1] = INTPL_FILTERS[INTPL_POS_K >> 2]; // k p_dst[2] = frm->filtered[INTPL_POS_R] + off_dst; // r p_coeffs[2] = INTPL_FILTERS[INTPL_POS_R >> 2]; // r g_funcs.intpl_luma_ext_x3(p_dst, stride, intpl_tmp[2], i_tmp, width, height, p_coeffs); } else { // j dst = frm->filtered[INTPL_POS_J] + off_dst; g_funcs.intpl_luma_ext(dst, stride, intpl_tmp[1], i_tmp, width, height, INTPL_FILTERS[INTPL_POS_J >> 2]); } /* 
--------------------------------------------------------------------------- * expand border for all 15 filtered planes */ { const int padh = XAVS2_PAD - PAD_OFFSET; const int padv = XAVS2_PAD - PAD_OFFSET; int i; width = frm->i_width[IMG_Y] + PAD_OFFSET * 2; /* loop over all 15 filtered planes */ for (i = 1; i < 16; i++) { pel_t *pix = frm->filtered[i]; if (pix != NULL) { pix += start_y * stride - PAD_OFFSET; plane_expand_border(pix, stride, width, height, padh, padv, b_start, b_end); } } } } /* --------------------------------------------------------------------------- */ void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y) { int b_start = !i_lcu_y; int b_end = i_lcu_y == h->i_height_in_lcu - 1; int y_start = (i_lcu_y + 0) << h->i_lcu_level; int y_end = (i_lcu_y + 1) << h->i_lcu_level; int height; slice_t *slice = h->slices[h->i_slice_index]; /* Чֵʼͽк */ if (b_start) { y_start -= PAD_OFFSET; } else { y_start -= MC_OFFSET; } if (b_end) { y_end = h->i_height + MC_OFFSET - PAD_OFFSET; } else { y_end -= MC_OFFSET; } /* sliceʱ */ if (h->param->slice_num > 1 && !b_start && !b_end) { if (slice->i_first_lcu_y == i_lcu_y) { /* Sliceϱ߽ */ y_start += (MC_OFFSET + PAD_OFFSET); } if (slice->i_last_lcu_y == i_lcu_y) { /* Slice±߽ */ y_end += PAD_OFFSET; } } height = y_end - y_start; // xavs2_log(NULL, XAVS2_LOG_DEBUG, "Intpl POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n", // h->fenc->i_frame, h->i_slice_index, i_lcu_y, y_start, y_end); interpolate_sample_rows(h, frm, y_start, height, b_start, b_end); } /** * =========================================================================== * interpolating for chroma * =========================================================================== */ /* --------------------------------------------------------------------------- * predict one component of a chroma block */ void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm) { int posx = pix_quad_x & 7; int posy = pix_quad_y & 7; int i_src = p_ref_frm->i_stride[IMG_U]; pel_t *p_src_u = p_ref_frm->planes[IMG_U]; pel_t *p_src_v = p_ref_frm->planes[IMG_V]; int src_offset = (pix_quad_y >> 3) * i_src + (pix_quad_x >> 3); p_src_u += src_offset; p_src_v += src_offset; if (posy == 0 && posx == 0) { if (width != 2 && width != 6 && height != 2 && height != 6) { g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_u, i_pred, p_src_u, i_src); g_funcs.pixf.copy_pp[PART_INDEX(width, height)](p_pred_v, i_pred, p_src_v, i_src); } else { g_funcs.align_copy(p_pred_u, i_pred, p_src_u, i_src, width, height); g_funcs.align_copy(p_pred_v, i_pred, p_src_v, i_src, width, height); } } else if (posy == 0) { g_funcs.intpl_chroma_block_hor(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx]); g_funcs.intpl_chroma_block_hor(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx]); } else if (posx == 0) { g_funcs.intpl_chroma_block_ver(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posy]); g_funcs.intpl_chroma_block_ver(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posy]); } else { g_funcs.intpl_chroma_block_ext(p_pred_u, i_pred, p_src_u, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); g_funcs.intpl_chroma_block_ext(p_pred_v, i_pred, p_src_v, i_src, width, height, INTPL_FILTERS_C[posx], INTPL_FILTERS_C[posy]); } } /** * =========================================================================== * low resolution (down sampling) * 
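 *
 * Each low-resolution pixel is the rounded average of a 2x2 block of
 * full-resolution pixels (two vertical pairwise averages, then one more
 * average across them), as implemented by the FILTER macro in
 * lowres_filter_core_c below.  For example, for the 2x2 block
 *     10 11
 *     12 13
 * the result is (((10+12+1)>>1) + ((11+13+1)>>1) + 1) >> 1 = 12.
 * The half-resolution planes built this way are presumably used for fast,
 * coarse pre-analysis / motion estimation (this use is an assumption, not
 * stated in this file).
 *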
=========================================================================== */ /* --------------------------------------------------------------------------- */ static void lowres_filter_core_c(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height) { #define FILTER(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1) int i_src2 = i_src << 1; // stride of 2 src lines int x, y; pel_t *dwn; for (y = 0; y < height; y++) { dwn = src + i_src; // point to down line of src for (x = 0; x < width; x++) { dst[x] = FILTER(src[2 * x], dwn[2 * x], src[2 * x + 1], dwn[2 * x + 1]); } src += i_src2; dst += i_dst; } #undef FILTER } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * global function set initial */ void xavs2_mem_oper_init(uint32_t cpuid, intrinsic_func_t *pf) { pf->fast_memcpy = memcpy; pf->memcpy_aligned = memcpy; pf->fast_memset = memset; pf->fast_memzero = memzero_aligned_c; pf->memzero_aligned = memzero_aligned_c; pf->mem_repeat_i = mem_repeat_i_c; pf->mem_repeat_p = memset; pf->lowres_filter = lowres_filter_core_c; #if ARCH_X86_64 pf->mem_repeat_i = mem_repeat_8i_c; // x64ܹ£ѭͬʱʹ64λֵ #endif #if HAVE_MMX if (cpuid & XAVS2_CPU_MMX) { pf->fast_memcpy = xavs2_fast_memcpy_mmx; pf->memcpy_aligned = xavs2_memcpy_aligned_mmx; pf->fast_memset = xavs2_fast_memset_mmx; pf->fast_memzero = xavs2_fast_memzero_mmx; pf->memzero_aligned = xavs2_fast_memzero_mmx; } if (cpuid & XAVS2_CPU_MMX2) { pf->lowres_filter = xavs2_lowres_filter_core_mmx2; } if (cpuid & XAVS2_CPU_SSE) { // pf->memcpy_aligned = xavs2_memcpy_aligned_sse; // pf->memzero_aligned = xavs2_memzero_aligned_sse; } if (cpuid & XAVS2_CPU_SSE2) { pf->memzero_aligned = xavs2_memzero_aligned_c_sse2; // pf->memcpy_aligned = xavs2_memcpy_aligned_c_sse2; pf->lowres_filter = xavs2_lowres_filter_core_sse2; // pf->mem_repeat_i = xavs2_mem_repeat_i_c_sse2; // TODO: C汾 } if (cpuid & XAVS2_CPU_SSSE3) { pf->lowres_filter = xavs2_lowres_filter_core_ssse3; } if (cpuid & XAVS2_CPU_AVX2) { pf->memzero_aligned = xavs2_memzero_aligned_c_avx; // pf->mem_repeat_i = xavs2_mem_repeat_i_c_avx; // TODO: C汾 pf->lowres_filter = xavs2_lowres_filter_core_avx; } #else UNUSED_PARAMETER(cpuid); #endif } /* --------------------------------------------------------------------------- */ void xavs2_mc_init(uint32_t cpuid, intrinsic_func_t *pf) { /* align copy */ pf->align_copy = mc_copy_c; /* plane copy */ pf->plane_copy = plane_copy_c; pf->plane_copy_deinterleave = plane_copy_deinterleave_c; /* interpolate */ pf->intpl_luma_hor = intpl_luma_hor_c; pf->intpl_luma_ver = intpl_luma_ver_c; pf->intpl_luma_ext = intpl_luma_ext_c; pf->intpl_luma_ver_x3 = intpl_luma_ver_x3_c; pf->intpl_luma_hor_x3 = intpl_luma_hor_x3_c; pf->intpl_luma_ext_x3 = intpl_luma_ext_x3_c; pf->intpl_luma_block_hor = intpl_luma_block_hor_c; pf->intpl_luma_block_ver = intpl_luma_block_ver_c; pf->intpl_luma_block_ext = intpl_luma_block_ext_c; pf->intpl_chroma_block_hor = intpl_chroma_block_hor_c; pf->intpl_chroma_block_ver = intpl_chroma_block_ver_c; pf->intpl_chroma_block_ext = intpl_chroma_block_ext_c; #if HAVE_MMX if (cpuid & XAVS2_CPU_MMX2) { pf->plane_copy = plane_copy_mmx2; pf->plane_copy_deinterleave = xavs2_plane_copy_deinterleave_mmx; } if (cpuid & XAVS2_CPU_SSE42) { pf->intpl_luma_hor = intpl_luma_hor_sse128; pf->intpl_luma_ver = intpl_luma_ver_sse128; pf->intpl_luma_ext = 
intpl_luma_ext_sse128; pf->intpl_luma_hor_x3 = intpl_luma_hor_x3_sse128; pf->intpl_luma_ver_x3 = intpl_luma_ver_x3_sse128; pf->intpl_luma_ext_x3 = intpl_luma_ext_x3_sse128; pf->intpl_luma_block_hor = intpl_luma_block_hor_sse128; pf->intpl_luma_block_ver = intpl_luma_block_ver_sse128; pf->intpl_luma_block_ext = intpl_luma_block_ext_sse128; pf->intpl_chroma_block_hor = intpl_chroma_block_hor_sse128; pf->intpl_chroma_block_ver = intpl_chroma_block_ver_sse128; pf->intpl_chroma_block_ext = intpl_chroma_block_ext_sse128; } if (cpuid & XAVS2_CPU_AVX2) { pf->intpl_luma_hor = intpl_luma_hor_avx2; pf->intpl_luma_ver = intpl_luma_ver_avx2; pf->intpl_luma_ext = intpl_luma_ext_avx2; pf->intpl_luma_ver_x3 = intpl_luma_ver_x3_avx2; pf->intpl_luma_hor_x3 = intpl_luma_hor_x3_avx2; pf->intpl_luma_ext_x3 = intpl_luma_ext_x3_avx2; pf->intpl_luma_block_hor = intpl_luma_block_hor_avx2; pf->intpl_luma_block_ver = intpl_luma_block_ver_avx2; pf->intpl_luma_block_ext = intpl_luma_block_ext_avx2; pf->intpl_chroma_block_ver = intpl_chroma_block_ver_avx2; pf->intpl_chroma_block_hor = intpl_chroma_block_hor_avx2; pf->intpl_chroma_block_ext = intpl_chroma_block_ext_avx2; } #else UNUSED_PARAMETER(cpuid); #endif } xavs2-1.3/source/common/mc.h000066400000000000000000000073531340660520300157670ustar00rootroot00000000000000/* * mc.h * * Description of this file: * MC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
 */

#ifndef XAVS2_MC_H
#define XAVS2_MC_H

/**
 * ===========================================================================
 * inline function declares
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * img_size: width or height of the picture (luma component, pixel precision)
 * blk_size: width or height of the current prediction block (luma component, pixel precision)
 * blk_pos : x/y position of the current block within the picture (pixel precision)
 * mv      : x/y component of the MV (1/4-pel precision)
 */
static INLINE int cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv)
{
    int imv = mv >> 2;   // integer-pel part of the MV
    int fmv = mv & 7;    // fractional-pel part of the MV, in 1/8-pel units

    if (blk_pos + imv < -blk_size - 8) {
        return ((-blk_size - 8) << 2) + (fmv);
    } else if (blk_pos + imv > img_size + 4) {
        return ((img_size + 4) << 2) + (fmv);
    } else {
        return (blk_pos << 2) + mv;
    }
}

/* --------------------------------------------------------------------------- */
static ALWAYS_INLINE void get_mv_for_mc(xavs2_t *h, mv_t *mv, int pic_pix_x, int pic_pix_y, int blk_w, int blk_h)
{
    // WARNING: the precision here is sufficient for 4K pictures; it needs to be revised for 8K
    mv->x = (int16_t)cu_get_mc_pos(h->i_width,  blk_w, pic_pix_x, mv->x);
    mv->y = (int16_t)cu_get_mc_pos(h->i_height, blk_h, pic_pix_y, mv->y);
}

/**
 * ===========================================================================
 * function declares
 * ===========================================================================
 */

#define interpolate_lcu_row FPFX(interpolate_lcu_row)
void interpolate_lcu_row(xavs2_t *h, xavs2_frame_t* frm, int i_lcu_y);

#define interpolate_sample_rows FPFX(interpolate_sample_rows)
void interpolate_sample_rows(xavs2_t *h, xavs2_frame_t* frm, int start_y, int height, int b_start, int b_end);

#define mc_luma FPFX(mc_luma)
void mc_luma  (pel_t *p_pred, int i_pred, int pic_pix_x, int pic_pix_y, int width, int height, const xavs2_frame_t *p_ref_frm);

#define mc_chroma FPFX(mc_chroma)
void mc_chroma(pel_t *p_pred_u, pel_t *p_pred_v, int i_pred, int pix_quad_x, int pix_quad_y, int width, int height, const xavs2_frame_t *p_ref_frm);

#endif  // XAVS2_MC_H
xavs2-1.3/source/common/nal.h000066400000000000000000000116041340660520300161340ustar00rootroot00000000000000/*
 * nal.h
 *
 * Description of this file:
 *    NAL functions definition of the xavs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    Homepage1: http://vcl.idm.pku.edu.cn/xavs2
 *    Homepage2: https://github.com/pkuvcl/xavs2
 *    Homepage3: https://gitee.com/pkuvcl/xavs2
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 *    This program is also available under a commercial proprietary license.
 *    For more information, contact us at sswang @ pku.edu.cn.
*/ #ifndef XAVS2_NAL_H #define XAVS2_NAL_H #include "bitstream.h" /** * =========================================================================== * nal function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void nal_start(xavs2_t *h, int i_type, int i_ref_idc) { nal_t *nal = &h->p_nal[h->i_nal]; nal->i_ref_idc = i_ref_idc; nal->i_type = i_type; nal->i_payload = 0; nal->p_payload = &h->p_bs_buf_header[xavs2_bs_pos(&h->header_bs) >> 3]; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void nal_end(xavs2_t *h) { nal_t *nal = &h->p_nal[h->i_nal]; uint8_t *end = &h->p_bs_buf_header[xavs2_bs_pos(&h->header_bs) >> 3]; nal->i_payload = (int)(end - nal->p_payload); h->i_nal++; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void nal_merge_slice(xavs2_t *h, uint8_t *p_bs_buffer, int i_bs_len, int i_type, int i_ref_idc) { nal_t *nal = &h->p_nal[h->i_nal]; assert(i_bs_len > 8); // update the current nal nal->i_ref_idc = i_ref_idc; nal->i_type = i_type; nal->i_payload = i_bs_len; nal->p_payload = p_bs_buffer; // next nal h->i_nal++; } /* --------------------------------------------------------------------------- */ static INLINE uint8_t *nal_escape_c(uint8_t *dst, uint8_t *src, uint8_t *end) { int left_bits = 8; uint8_t tmp = 0; /* check pseudo start code */ while (src < end) { tmp |= (uint8_t)(*src >> (8 - left_bits)); if (tmp <= 0x03 && !dst[-2] && !dst[-1]) { *dst++ = 0x02; /* insert '10' */ tmp <<= 6; if (left_bits >= 2) { tmp |= (uint8_t)((*src++) << (left_bits - 2)); left_bits = left_bits - 2; } else { tmp |= (uint8_t)((*src) >> (2 - left_bits)); *dst++ = tmp; tmp = (uint8_t)((*src++) << (6 + left_bits)); left_bits = 6 + left_bits; } continue; } *dst++ = tmp; tmp = (uint8_t)((*src++) << left_bits); } /* rest bits */ if (left_bits != 8 && tmp != 0) { *dst++ = tmp; } return dst; } /* --------------------------------------------------------------------------- */ static INLINE intptr_t encoder_encapsulate_nals(xavs2_t *h, xavs2_frame_t *frm, int start) { uint8_t *nal_buffer; int previous_nal_size = 0; int nal_size = 0; int i; for (i = 0; i < start; i++) { previous_nal_size += h->p_nal[i].i_payload; } for (i = start; i < h->i_nal; i++) { nal_size += h->p_nal[i].i_payload; } /* NOTE: frame->i_bs_buf is big enough, no need to reallocate memory */ // assert(previous_nal_size + nal_size <= frame->i_bs_buf); /* copy new nals */ nal_buffer = frm->p_bs_buf + previous_nal_size; nal_size = h->i_nal; /* number of all nals */ for (i = start; i < nal_size; i++) { nal_t *nal = &h->p_nal[i]; memcpy(nal_buffer, nal->p_payload, nal->i_payload); nal_buffer += nal->i_payload; } return nal_buffer - (frm->p_bs_buf + previous_nal_size); } #endif // XAVS2_NAL_H xavs2-1.3/source/common/osdep.h000066400000000000000000000332521340660520300164770ustar00rootroot00000000000000/* * osdep.h * * Description of this file: * platform-specific code functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_OSDEP_H #define XAVS2_OSDEP_H /** * =========================================================================== * includes * =========================================================================== */ /* --------------------------------------------------------------------------- * disable warning C4996: functions or variables may be unsafe. */ #if defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #define _CRT_NONSTDC_NO_DEPRECATE #define _CRT_SECURE_NO_DEPRECATE #define _CRT_SECURE_NO_WARNINGS #include #include #endif #define _LARGEFILE_SOURCE 1 #define _FILE_OFFSET_BITS 64 #if defined(__ICL) || defined(_MSC_VER) #include "configw.h" #else #include "config.h" #endif #include #include #include #if HAVE_STDINT_H #include #else #include #endif #if defined(__INTEL_COMPILER) #include #else #include #endif #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #endif /* --------------------------------------------------------------------------- * disable warning C4100: unreferenced formal parameter */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define UNUSED_PARAMETER(P) (P) /* same as UNREFERENCED_PARAMETER */ #else #define UNUSED_PARAMETER(P) #endif /** * =========================================================================== * const defines * =========================================================================== */ /* --------------------------------------------------------------------------- * Specifies the number of bits per pixel that xavs2 encoder uses. This is also the * bit depth that xavs2 encoder encodes in. If this value is > 8, xavs2 encoder will read * two bytes of input data for each pixel sample, and expect the upper * (16-XAVS2_BIT_DEPTH) bits to be zero. * Note: The flag XAVS2_CSP_HIGH_DEPTH must be used to specify the * colorspace depth as well. 
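 * For example, with XAVS2_BIT_DEPTH == 10 every input sample occupies two
 * bytes and only its low 10 bits may be non-zero.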
*/ #define XAVS2_BIT_DEPTH BIT_DEPTH #define WORD_SIZE sizeof(void*) #define asm __asm__ /** * =========================================================================== * const defines * =========================================================================== */ #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) #define UNINIT(x) x = x #define UNUSED __attribute__((unused)) #define ALWAYS_INLINE __attribute__((always_inline)) inline #define NOINLINE __attribute__((noinline)) #define MAY_ALIAS __attribute__((may_alias)) #define xavs2_constant_p(x) __builtin_constant_p(x) #define xavs2_nonconstant_p(x) (!__builtin_constant_p(x)) #define INLINE __inline #else #define UNINIT(x) x #if defined(__ICL) #define ALWAYS_INLINE __forceinline #define NOINLINE __declspec(noinline) #else #define ALWAYS_INLINE INLINE #define NOINLINE #endif #define UNUSED #define MAY_ALIAS #define xavs2_constant_p(x) 0 #define xavs2_nonconstant_p(x) 0 #endif #if defined(__ICL) || defined(_MSC_VER) #define INLINE __inline #define strcasecmp _stricmp #define strncasecmp _strnicmp #if !HAVE_POSIXTHREAD #define strtok_r strtok_s #endif #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #endif #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64) #ifndef HAVE_X86_INLINE_ASM #define HAVE_X86_INLINE_ASM 1 #endif #endif /* --------------------------------------------------------------------------- * align */ /* align a pointer */ # define CACHE_LINE_SIZE 32 /* for x86-64 and x86 */ # define ALIGN_POINTER(p) (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_SIZE - 1)) & (~(intptr_t)(CACHE_LINE_SIZE - 1))) # define CACHE_LINE_256B 32 /* for x86-64 and x86 */ # define ALIGN_256_PTR(p) (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_256B - 1)) & (~(intptr_t)(CACHE_LINE_256B - 1))) #if defined(_MSC_VER) #pragma warning(disable:4324) /* disable warning C4324: __declspec(align())ṹ */ #define DECLARE_ALIGNED(var, n) __declspec(align(n)) var #else #define DECLARE_ALIGNED(var, n) var __attribute__((aligned(n))) #endif #define ALIGN32(var) DECLARE_ALIGNED(var, 32) #define ALIGN16(var) DECLARE_ALIGNED(var, 16) #define ALIGN8(var) DECLARE_ALIGNED(var, 8) // ARM compiliers don't reliably align stack variables // - EABI requires only 8 byte stack alignment to be maintained // - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function // - armcc can't either, but is nice enough to actually tell you so // - Apple gcc only maintains 4 byte alignment // - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils... #define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\ uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \ type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask) #if ARCH_ARM && SYS_MACOSX #define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) #else #define ALIGNED_ARRAY_8( type, name, sub1, ... ) \ ALIGN8( type name sub1 __VA_ARGS__ ) #endif #if ARCH_ARM #define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) #else #define ALIGNED_ARRAY_16( type, name, sub1, ... ) \ ALIGN16( type name sub1 __VA_ARGS__ ) #endif #define EXPAND(x) x #if defined(STACK_ALIGNMENT) && STACK_ALIGNMENT >= 32 #define ALIGNED_ARRAY_32( type, name, sub1, ... ) \ ALIGN32( type name sub1 __VA_ARGS__ ) #else #define ALIGNED_ARRAY_32(...) EXPAND(ALIGNED_ARRAY_EMU(31, __VA_ARGS__)) #endif #define ALIGNED_ARRAY_64(...) 
EXPAND(ALIGNED_ARRAY_EMU(63, __VA_ARGS__)) /* For AVX2 */ #if ARCH_X86 || ARCH_X86_64 #define NATIVE_ALIGN 32 #define ALIGNED_N ALIGN32 #define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 #else #define NATIVE_ALIGN 16 #define ALIGNED_N ALIGN16 #define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 #endif /* --------------------------------------------------------------------------- * threads */ #if HAVE_BEOSTHREAD #include #define xavs2_thread_t thread_id static int ALWAYS_INLINE xavs2_thread_create(xavs2_thread_t *t, void *a, void *(*f)(void *), void *d) { *t = spawn_thread(f, "", 10, d); if (*t < B_NO_ERROR) { return -1; } resume_thread(*t); return 0; } #define xavs2_thread_join(t,s) \ {\ long tmp; \ wait_for_thread(t,(s)?(long*)(*(s)):&tmp);\ } #elif HAVE_POSIXTHREAD #if defined(_MSC_VER) || defined(__ICL) #if _MSC_VER >= 1900 #define HAVE_STRUCT_TIMESPEC 1 /* for struct timespec */ #endif #pragma comment(lib, "pthread_lib.lib") #endif #include #define xavs2_thread_t pthread_t #define xavs2_thread_create pthread_create #define xavs2_thread_join pthread_join #define xavs2_thread_mutex_t pthread_mutex_t #define xavs2_thread_mutex_init pthread_mutex_init #define xavs2_thread_mutex_destroy pthread_mutex_destroy #define xavs2_thread_mutex_lock pthread_mutex_lock #define xavs2_thread_mutex_unlock pthread_mutex_unlock #define xavs2_thread_cond_t pthread_cond_t #define xavs2_thread_cond_init pthread_cond_init #define xavs2_thread_cond_destroy pthread_cond_destroy #define xavs2_thread_cond_signal pthread_cond_signal #define xavs2_thread_cond_broadcast pthread_cond_broadcast #define xavs2_thread_cond_wait pthread_cond_wait #define xavs2_thread_attr_t pthread_attr_t #define xavs2_thread_attr_init pthread_attr_init #define xavs2_thread_attr_destroy pthread_attr_destroy #define xavs2_thread_attr_setdetachstate pthread_attr_setdetachstate #define xavs2_thread_num_processors_np pthread_num_processors_np #define XAVS2_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #elif HAVE_WIN32THREAD #include "win32thread.h" #else #define xavs2_thread_t int #define xavs2_thread_create(t,u,f,d) 0 #define xavs2_thread_join(t,s) #endif // HAVE_*THREAD #if !HAVE_POSIXTHREAD && !HAVE_WIN32THREAD #define xavs2_thread_mutex_t int #define xavs2_thread_mutex_init(m,f) 0 #define xavs2_thread_mutex_destroy(m) #define xavs2_thread_mutex_lock(m) #define xavs2_thread_mutex_unlock(m) #define xavs2_thread_cond_t int #define xavs2_thread_cond_init(c,f) 0 #define xavs2_thread_cond_destroy(c) #define xavs2_thread_cond_broadcast(c) #define xavs2_thread_cond_wait(c,m) #define xavs2_thread_attr_t int #define xavs2_thread_attr_init(a) 0 #define xavs2_thread_attr_destroy(a) #define XAVS2_PTHREAD_MUTEX_INITIALIZER 0 #endif #if HAVE_POSIXTHREAD #if SYS_WINDOWS #define xavs2_lower_thread_priority(p) \ {\ xavs2_thread_t handle = pthread_self();\ struct sched_param sp;\ int policy = SCHED_OTHER;\ pthread_getschedparam(handle, &policy, &sp);\ sp.sched_priority -= p;\ pthread_setschedparam(handle, policy, &sp);\ } #else #include #define xavs2_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); } #endif /* SYS_WINDOWS */ #elif HAVE_WIN32THREAD #define xavs2_lower_thread_priority(p) SetThreadPriority(GetCurrentThread(), XAVS2_MAX(-2, -p)) #else #define xavs2_lower_thread_priority(p) #endif #if SYS_WINDOWS #define xavs2_sleep_ms(x) Sleep(x) #else #define xavs2_sleep_ms(x) usleep(x * 1000) #endif /** * =========================================================================== * inline functions * 
=========================================================================== */ #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define xavs2_clz(x) __builtin_clz(x) #define xavs2_ctz(x) __builtin_ctz(x) #elif defined(_MSC_VER) && defined(_WIN32) static int ALWAYS_INLINE xavs2_clz(const uint32_t x) { DWORD r; _BitScanReverse(&r, (DWORD)x); return (r ^ 31); } static int ALWAYS_INLINE xavs2_ctz(const uint32_t x) { DWORD r; _BitScanForward(&r, (DWORD)x); return r; } #else static int ALWAYS_INLINE xavs2_clz(uint32_t x) { static uint8_t lut[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; int y, z = (((x >> 16) - 1) >> 27) & 16; x >>= z ^ 16; z += y = ((x - 0x100) >> 28) & 8; x >>= y ^ 8; z += y = ((x - 0x10) >> 29) & 4; x >>= y ^ 4; return z + lut[x]; } static int ALWAYS_INLINE xavs2_ctz(uint32_t x) { static uint8_t lut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; int y, z = (((x & 0xffff) - 1) >> 27) & 16; x >>= z; z += y = (((x & 0xff) - 1) >> 28) & 8; x >>= y; z += y = (((x & 0xf) - 1) >> 29) & 4; x >>= y; return z + lut[x & 0xf]; } #endif /* --------------------------------------------------------------------------- * prefetch */ #if HAVE_X86_INLINE_ASM && HAVE_MMX /* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable * of using complex address modes properly unless we use inline asm. */ static void ALWAYS_INLINE xavs2_prefetch(void *p) { asm volatile("prefetcht0 %0"::"m"(*(uint8_t *)p)); } /* We require that prefetch not fault on invalid reads, so we only enable it * on known architectures. */ #elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\ (ARCH_X86 || ARCH_X86_64 || ARCH_ARM || ARCH_PPC) #define xavs2_prefetch(x) __builtin_prefetch(x) #elif defined(_MSC_VER) #define xavs2_prefetch(x) _mm_prefetch((const char*)(x), _MM_HINT_T0) #else #define xavs2_prefetch(x) #endif /* --------------------------------------------------------------------------- * log2/log2f */ #if !HAVE_LOG2F #define log2f(x) (logf(x)/0.693147180559945f) #define log2(x) (log(x)/0.693147180559945) #endif #endif /* XAVS2_OSDEP_H */ xavs2-1.3/source/common/pixel.c000066400000000000000000002064221340660520300165020ustar00rootroot00000000000000/* * pixel.c * * Description of this file: * Pixel processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "pixel.h" #include "cpu.h" #include #if HAVE_MMX #include "vec/intrinsic.h" #include "x86/pixel.h" #include "x86/blockcopy8.h" #include "x86/pixel-util.h" #endif /** * =========================================================================== * global variables * =========================================================================== */ /* --------------------------------------------------------------------------- * partition map table */ #define INVALID LUMA_INVALID const uint8_t g_partition_map_tab[] = { // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64 LUMA_4x4, LUMA_4x8, INVALID, LUMA_4x16, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 4 LUMA_8x4, LUMA_8x8, INVALID, LUMA_8x16, INVALID, INVALID, INVALID, LUMA_8x32, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 8 INVALID, INVALID, INVALID, LUMA_12x16, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 12 LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, INVALID, INVALID, INVALID, LUMA_16x32, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, LUMA_16x64, // 16 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 20 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, LUMA_24x32, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 24 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 28 INVALID, LUMA_32x8, INVALID, LUMA_32x16, INVALID, LUMA_32x24, INVALID, LUMA_32x32, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, LUMA_32x64, // 32 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 36 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 40 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 44 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, LUMA_48x64, // 48 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 52 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 56 INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, // 60 INVALID, INVALID, INVALID, LUMA_64x16, INVALID, INVALID, INVALID, LUMA_64x32, INVALID, INVALID, INVALID, LUMA_64x48, INVALID, INVALID, INVALID, LUMA_64x64 // 64 }; #undef INVALID /** * =========================================================================== * local function defines * =========================================================================== */ /** * --------------------------------------------------------------------------- * SAD * --------------------------------------------------------------------------- */ #define PIXEL_SAD_C(w, h) \ static cmp_dist_t 
xavs2_pixel_sad_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sum = 0;\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x += 4) {\ sum += abs(pix1[x] - pix2[x]);\ sum += abs(pix1[x + 1] - pix2[x + 1]);\ sum += abs(pix1[x + 2] - pix2[x + 2]);\ sum += abs(pix1[x + 3] - pix2[x + 3]);\ }\ pix1 += i_pix1;\ pix2 += i_pix2;\ }\ return sum;\ } PIXEL_SAD_C(64, 64) /* 64x64 */ PIXEL_SAD_C(64, 32) PIXEL_SAD_C(32, 64) PIXEL_SAD_C(64, 16) PIXEL_SAD_C(64, 48) PIXEL_SAD_C(16, 64) PIXEL_SAD_C(48, 64) PIXEL_SAD_C(32, 32) /* 32x32 */ PIXEL_SAD_C(32, 16) PIXEL_SAD_C(16, 32) PIXEL_SAD_C(32, 8) PIXEL_SAD_C(32, 24) PIXEL_SAD_C( 8, 32) PIXEL_SAD_C(24, 32) PIXEL_SAD_C(16, 16) /* 16x16 */ PIXEL_SAD_C(16, 8) PIXEL_SAD_C( 8, 16) PIXEL_SAD_C(16, 4) PIXEL_SAD_C(16, 12) PIXEL_SAD_C( 4, 16) PIXEL_SAD_C(12, 16) PIXEL_SAD_C( 8, 8) /* 8x8 */ PIXEL_SAD_C( 8, 4) PIXEL_SAD_C( 4, 8) PIXEL_SAD_C( 4, 4) /* 4x4 */ /** * --------------------------------------------------------------------------- * SAD x3 * --------------------------------------------------------------------------- */ #define PIXEL_SAD_X3_C(w, h) \ void xavs2_pixel_sad_x3_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pel_t* pix3, const pel_t* pix4, intptr_t i_fref_stride, int32_t* res)\ {\ int x, y;\ res[0] = 0;\ res[1] = 0;\ res[2] = 0;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ res[0] += abs(pix1[x] - pix2[x]);\ res[1] += abs(pix1[x] - pix3[x]);\ res[2] += abs(pix1[x] - pix4[x]);\ }\ pix1 += FENC_STRIDE;\ pix2 += i_fref_stride;\ pix3 += i_fref_stride;\ pix4 += i_fref_stride;\ }\ } PIXEL_SAD_X3_C(64, 64) /* 64x64 */ PIXEL_SAD_X3_C(64, 32) PIXEL_SAD_X3_C(32, 64) PIXEL_SAD_X3_C(64, 16) PIXEL_SAD_X3_C(64, 48) PIXEL_SAD_X3_C(16, 64) PIXEL_SAD_X3_C(48, 64) PIXEL_SAD_X3_C(32, 32) /* 32x32 */ PIXEL_SAD_X3_C(32, 16) PIXEL_SAD_X3_C(16, 32) PIXEL_SAD_X3_C(32, 8) PIXEL_SAD_X3_C(32, 24) PIXEL_SAD_X3_C( 8, 32) PIXEL_SAD_X3_C(24, 32) PIXEL_SAD_X3_C(16, 16) /* 16x16 */ PIXEL_SAD_X3_C(16, 8) PIXEL_SAD_X3_C( 8, 16) PIXEL_SAD_X3_C(16, 4) PIXEL_SAD_X3_C(16, 12) PIXEL_SAD_X3_C( 4, 16) PIXEL_SAD_X3_C(12, 16) PIXEL_SAD_X3_C( 8, 8) /* 8x8 */ PIXEL_SAD_X3_C( 8, 4) PIXEL_SAD_X3_C( 4, 8) PIXEL_SAD_X3_C( 4, 4) /* 4x4 */ /** * --------------------------------------------------------------------------- * SAD x4 * --------------------------------------------------------------------------- */ #define PIXEL_SAD_X4_C(w, h) \ void xavs2_pixel_sad_x4_##w##x##h(const pel_t* pix1, const pel_t* pix2, const pel_t* pix3, const pel_t* pix4, const pel_t* pix5, intptr_t i_fref_stride, int32_t* res)\ {\ int x, y;\ res[0] = 0;\ res[1] = 0;\ res[2] = 0;\ res[3] = 0;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ res[0] += abs(pix1[x] - pix2[x]);\ res[1] += abs(pix1[x] - pix3[x]);\ res[2] += abs(pix1[x] - pix4[x]);\ res[3] += abs(pix1[x] - pix5[x]);\ }\ pix1 += FENC_STRIDE;\ pix2 += i_fref_stride;\ pix3 += i_fref_stride;\ pix4 += i_fref_stride;\ pix5 += i_fref_stride;\ }\ } PIXEL_SAD_X4_C(64, 64) /* 64x64 */ PIXEL_SAD_X4_C(64, 32) PIXEL_SAD_X4_C(32, 64) PIXEL_SAD_X4_C(64, 16) PIXEL_SAD_X4_C(64, 48) PIXEL_SAD_X4_C(16, 64) PIXEL_SAD_X4_C(48, 64) PIXEL_SAD_X4_C(32, 32) /* 32x32 */ PIXEL_SAD_X4_C(32, 16) PIXEL_SAD_X4_C(16, 32) PIXEL_SAD_X4_C(32, 8) PIXEL_SAD_X4_C(32, 24) PIXEL_SAD_X4_C( 8, 32) PIXEL_SAD_X4_C(24, 32) PIXEL_SAD_X4_C(16, 16) /* 16x16 */ PIXEL_SAD_X4_C(16, 8) PIXEL_SAD_X4_C( 8, 16) PIXEL_SAD_X4_C(16, 4) PIXEL_SAD_X4_C(16, 12) PIXEL_SAD_X4_C( 4, 16) PIXEL_SAD_X4_C(12, 16) PIXEL_SAD_X4_C( 8, 8) /* 8x8 */ 
PIXEL_SAD_X4_C( 8, 4) PIXEL_SAD_X4_C( 4, 8) PIXEL_SAD_X4_C( 4, 4) /* 4x4 */ /** * --------------------------------------------------------------------------- * SATD * --------------------------------------------------------------------------- */ #define BITS_PER_SUM (8 * sizeof(uint16_t)) #define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) \ {\ uint32_t t0 = s0 + s1;\ uint32_t t1 = s0 - s1;\ uint32_t t2 = s2 + s3;\ uint32_t t3 = s2 - s3;\ d0 = t0 + t2;\ d2 = t0 - t2;\ d1 = t1 + t3;\ d3 = t1 - t3;\ } #define HADAMARD4_10bit(d0, d1, d2, d3, s0, s1, s2, s3) \ {\ uint64_t t0 = s0 + s1; \ uint64_t t1 = s0 - s1; \ uint64_t t2 = s2 + s3; \ uint64_t t3 = s2 - s3; \ d0 = t0 + t2; \ d2 = t0 - t2; \ d1 = t1 + t3; \ d3 = t1 - t3; \ } /* --------------------------------------------------------------------------- * in: a pseudo-simd number of the form x+(y<<16) * return: abs(x) + (abs(y)<<16) */ ALWAYS_INLINE uint32_t abs2(uint32_t a) { uint32_t s = ((a >> (BITS_PER_SUM - 1)) & (((uint32_t)1 << BITS_PER_SUM) + 1)) * ((uint16_t)-1); return (a + s) ^ s; } ALWAYS_INLINE uint64_t abs2_10bit(uint64_t a) { uint64_t s = ((a >> (BITS_PER_SUM - 1)) & (((uint64_t)1 << BITS_PER_SUM) + 1)) * ((uint64_t)-1); return (a + s) ^ s; } /* --------------------------------------------------------------------------- */ static cmp_dist_t xavs2_pixel_satd_4x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2) { uint32_t tmp[4][2]; uint32_t a0, a1, a2, a3, b0, b1; cmp_dist_t sum = 0; int i; for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) { a0 = pix1[0] - pix2[0]; a1 = pix1[1] - pix2[1]; b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); a2 = pix1[2] - pix2[2]; a3 = pix1[3] - pix2[3]; b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); tmp[i][0] = b0 + b1; tmp[i][1] = b0 - b1; } for (i = 0; i < 2; i++) { HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); sum += ((uint16_t)a0) + (a0 >> BITS_PER_SUM); } return (sum >> 1); } /* --------------------------------------------------------------------------- * SWAR version of satd 8x4, performs two 4x4 SATDs at once */ static cmp_dist_t xavs2_pixel_satd_8x4(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2) { uint32_t tmp[4][4]; uint32_t a0, a1, a2, a3; cmp_dist_t sum = 0; int i; for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) { a0 = (pix1[0] - pix2[0]) + ((uint32_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); a1 = (pix1[1] - pix2[1]) + ((uint32_t)(pix1[5] - pix2[5]) << BITS_PER_SUM); a2 = (pix1[2] - pix2[2]) + ((uint32_t)(pix1[6] - pix2[6]) << BITS_PER_SUM); a3 = (pix1[3] - pix2[3]) + ((uint32_t)(pix1[7] - pix2[7]) << BITS_PER_SUM); HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); } for (i = 0; i < 4; i++) { HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); } return (((uint16_t)sum) + (sum >> BITS_PER_SUM)) >> 1; } /* --------------------------------------------------------------------------- * calculate satd in blocks of 4x4 */ #define PIXEL_SATD4_C(w, h) \ static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t satd = 0;\ int y, x;\ for (y = 0; y < h; y += 4) {\ for (x = 0; x < w; x += 4) {\ satd += xavs2_pixel_satd_4x4(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return satd;\ } /* --------------------------------------------------------------------------- * calculate satd in blocks of 
8x4 */ #define PIXEL_SATD8_C(w, h) \ static cmp_dist_t xavs2_pixel_satd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t satd = 0;\ int y, x;\ for (y = 0; y < h; y += 4) {\ for (x = 0; x < w; x += 8) {\ satd += xavs2_pixel_satd_8x4(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return satd;\ } PIXEL_SATD8_C(64, 64) /* 64x64 */ PIXEL_SATD8_C(64, 32) PIXEL_SATD8_C(32, 64) PIXEL_SATD8_C(64, 16) PIXEL_SATD8_C(64, 48) PIXEL_SATD8_C(16, 64) PIXEL_SATD8_C(48, 64) PIXEL_SATD8_C(32, 32) /* 32x32 */ PIXEL_SATD8_C(32, 16) PIXEL_SATD8_C(16, 32) PIXEL_SATD8_C(32, 8) PIXEL_SATD8_C(32, 24) PIXEL_SATD8_C( 8, 32) PIXEL_SATD8_C(24, 32) PIXEL_SATD8_C(16, 16) /* 16x16 */ PIXEL_SATD8_C(16, 8) PIXEL_SATD8_C( 8, 16) PIXEL_SATD8_C(16, 4) PIXEL_SATD8_C(16, 12) PIXEL_SATD4_C( 4, 16) PIXEL_SATD4_C(12, 16) PIXEL_SATD8_C( 8, 8) /* 8x8 */ PIXEL_SATD4_C( 4, 8) /** * --------------------------------------------------------------------------- * SA8D * --------------------------------------------------------------------------- */ int _sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; sum2_t sum = 0; for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) { a0 = pix1[0] - pix2[0]; a1 = pix1[1] - pix2[1]; b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); a2 = pix1[2] - pix2[2]; a3 = pix1[3] - pix2[3]; b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); a4 = pix1[4] - pix2[4]; a5 = pix1[5] - pix2[5]; b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); a6 = pix1[6] - pix2[6]; a7 = pix1[7] - pix2[7]; b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); } for (int i = 0; i < 4; i++) { HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); b0 = abs2(a0 + a4) + abs2(a0 - a4); b0 += abs2(a1 + a5) + abs2(a1 - a5); b0 += abs2(a2 + a6) + abs2(a2 - a6); b0 += abs2(a3 + a7) + abs2(a3 - a7); sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); } return (cmp_dist_t)sum; } /* --------------------------------------------------------------------------- */ static cmp_dist_t xavs2_pixel_sa8d_8x8(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) { return (cmp_dist_t)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } /* --------------------------------------------------------------------------- */ static cmp_dist_t xavs2_pixel_sa8d_16x16(const pel_t* pix1, intptr_t i_pix1, const pel_t* pix2, intptr_t i_pix2) { cmp_dist_t sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because // this version only rounds once at the end return (sum + 2) >> 2; } /* --------------------------------------------------------------------------- * calculate sa8d in blocks of 8x8 */ #define PIXEL_SA8D_C8(w, h) \ static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sa8d = 0;\ int y, x;\ for (y = 0; y < h; y += 8) {\ for (x = 0; x < w; x += 8) {\ sa8d += xavs2_pixel_sa8d_8x8(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return sa8d;\ } /* 
--------------------------------------------------------------------------- * calculate sa8d in blocks of 16x16 */ #define PIXEL_SA8D_C16(w, h) \ static cmp_dist_t xavs2_pixel_sa8d_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ cmp_dist_t sa8d = 0;\ int y, x;\ for (y = 0; y < h; y += 16) {\ for (x = 0; x < w; x += 16) {\ sa8d += xavs2_pixel_sa8d_16x16(pix1 + y * i_pix1 + x, i_pix1,\ pix2 + y * i_pix2 + x, i_pix2);\ }\ }\ return sa8d;\ } #define xavs2_pixel_sa8d_4x4 xavs2_pixel_satd_4x4 #define xavs2_pixel_sa8d_4x8 xavs2_pixel_satd_4x8 #define xavs2_pixel_sa8d_8x4 xavs2_pixel_satd_8x4 #define xavs2_pixel_sa8d_16x4 xavs2_pixel_satd_16x4 #define xavs2_pixel_sa8d_4x16 xavs2_pixel_satd_4x16 #define xavs2_pixel_sa8d_12x16 xavs2_pixel_satd_12x16 #define xavs2_pixel_sa8d_16x12 xavs2_pixel_satd_16x12 PIXEL_SA8D_C8(8, 16) PIXEL_SA8D_C8(8, 32) PIXEL_SA8D_C8(16, 8) PIXEL_SA8D_C8(32, 8) PIXEL_SA8D_C16(32, 16) PIXEL_SA8D_C8(32, 24) PIXEL_SA8D_C8(24, 32) PIXEL_SA8D_C16(32, 32) PIXEL_SA8D_C16(16, 32) PIXEL_SA8D_C16(64, 16) PIXEL_SA8D_C16(64, 32) PIXEL_SA8D_C16(64, 48) PIXEL_SA8D_C16(16, 64) PIXEL_SA8D_C16(32, 64) PIXEL_SA8D_C16(48, 64) PIXEL_SA8D_C16(64, 64) /** * --------------------------------------------------------------------------- * SSD * --------------------------------------------------------------------------- */ dist_t xavs2_get_block_ssd_c(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height) { dist_t sum = 0; int x, y, tmp; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { tmp = pix1[x] - pix2[x]; sum += (tmp * tmp); } pix1 += i_pix1; pix2 += i_pix2; } return sum; } #define PIXEL_SSD_C(w, h) \ static dist_t xavs2_pixel_ssd_##w##x##h(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2)\ {\ dist_t sum = 0;\ int x, y, tmp;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ tmp = pix1[x] - pix2[x];\ sum += (tmp * tmp);\ }\ pix1 += i_pix1;\ pix2 += i_pix2;\ }\ return sum;\ } PIXEL_SSD_C(64, 64) /* 64x64 */ PIXEL_SSD_C(64, 32) PIXEL_SSD_C(32, 64) PIXEL_SSD_C(64, 16) PIXEL_SSD_C(64, 48) PIXEL_SSD_C(16, 64) PIXEL_SSD_C(48, 64) PIXEL_SSD_C(32, 32) /* 32x32 */ PIXEL_SSD_C(32, 16) PIXEL_SSD_C(16, 32) PIXEL_SSD_C(32, 8) PIXEL_SSD_C(32, 24) PIXEL_SSD_C( 8, 32) PIXEL_SSD_C(24, 32) PIXEL_SSD_C(16, 16) /* 16x16 */ PIXEL_SSD_C(16, 8) PIXEL_SSD_C( 8, 16) PIXEL_SSD_C(16, 4) PIXEL_SSD_C(16, 12) PIXEL_SSD_C( 4, 16) PIXEL_SSD_C(12, 16) PIXEL_SSD_C( 8, 8) /* 8x8 */ PIXEL_SSD_C( 8, 4) PIXEL_SSD_C( 4, 8) PIXEL_SSD_C( 4, 4) /* 4x4 */ /* --------------------------------------------------------------------------- * ssd for one plane of frame */ #if XAVS2_STAT uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, pel_t *p_pix1, intptr_t i_pix1, pel_t *p_pix2, intptr_t i_pix2, int i_width, int i_height, int inout_shift) { uint64_t i_ssd = 0; int align = !(((intptr_t)p_pix1 | (intptr_t)p_pix2 | i_pix1 | i_pix2) & 15); int x, y; pixel_ssd_t cal_ssd[2]; if (inout_shift > 0) { int inout_offset = 1 << (inout_shift - 1); for (y = 0; y < i_height; y++) { for (x = 0; x < i_width; x++) { int d = ((p_pix1[x] + inout_offset) >> inout_shift) - ((p_pix2[x] + inout_offset) >> inout_shift); i_ssd += d * d; } p_pix1 += i_pix1; p_pix2 += i_pix2; } } else { cal_ssd[0] = pf->ssd[LUMA_8x8]; /* 8 x 8 */ cal_ssd[1] = pf->ssd[LUMA_16x16]; /* 16 x 16 */ #define SSD(id) i_ssd += cal_ssd[id](p_pix1 + y*i_pix1 + x, i_pix1, p_pix2 + y*i_pix2 + x, i_pix2) for (y = 0; y < i_height - 15;) { if (align) { for (x = 0; x < i_width - 15; x += 16) { 
SSD(1); /* 16x16 */ } y += 16; } else { for (x = 0; x < i_width - 7; x += 8) { SSD(0); /* 8x8 */ } y += 8; for (x = 0; x < i_width - 7; x += 8) { SSD(0); /* 8x8 */ } y += 8; } } if (y < i_height - 7) { for (x = 0; x < i_width - 7; x += 8) { SSD(0); /* 8x8 */ } } #undef SSD /* sum the rest ssd */ #define SSD1 { int d = p_pix1[y*i_pix1+x] - p_pix2[y*i_pix2+x]; i_ssd += d*d; } if (i_width & 7) { for (y = 0; y < (i_height & ~7); y++) { for (x = i_width & ~7; x < i_width; x++) { SSD1; } } } if (i_height & 7) { for (y = i_height & ~7; y < i_height; y++) { for (x = 0; x < i_width; x++) { SSD1; } } } #undef SSD1 } return i_ssd; } #endif /** * --------------------------------------------------------------------------- * AVG * --------------------------------------------------------------------------- */ #define PIXEL_AVG_C(w, h) \ static void xavs2_pixel_avg_##w##x##h(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight)\ {\ int x, y;\ UNUSED_PARAMETER(weight); \ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ dst[x] = (src0[x] + src1[x] + 1) >> 1;\ }\ dst += dstride;\ src0 += sstride0;\ src1 += sstride1;\ }\ } PIXEL_AVG_C(64, 64) /* 64x64 */ PIXEL_AVG_C(64, 32) PIXEL_AVG_C(32, 64) PIXEL_AVG_C(64, 16) PIXEL_AVG_C(64, 48) PIXEL_AVG_C(16, 64) PIXEL_AVG_C(48, 64) PIXEL_AVG_C(32, 32) /* 32x32 */ PIXEL_AVG_C(32, 16) PIXEL_AVG_C(16, 32) PIXEL_AVG_C(32, 8) PIXEL_AVG_C(32, 24) PIXEL_AVG_C( 8, 32) PIXEL_AVG_C(24, 32) PIXEL_AVG_C(16, 16) /* 16x16 */ PIXEL_AVG_C(16, 8) PIXEL_AVG_C( 8, 16) PIXEL_AVG_C(16, 4) PIXEL_AVG_C(16, 12) PIXEL_AVG_C( 4, 16) PIXEL_AVG_C(12, 16) PIXEL_AVG_C( 8, 8) /* 8x8 */ PIXEL_AVG_C( 8, 4) PIXEL_AVG_C( 4, 8) PIXEL_AVG_C( 4, 4) /* 4x4 */ /** * --------------------------------------------------------------------------- * block operation: copy/add/sub (p: pixel, s: short) * --------------------------------------------------------------------------- */ #define BLOCKCOPY_PP_C(w, h) \ static void xavs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = b[x];\ }\ a += stridea;\ b += strideb;\ }\ } #define BLOCKCOPY_SS_C(w, h) \ static void xavs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = b[x];\ }\ a += stridea;\ b += strideb;\ }\ } #define BLOCKCOPY_SP_C(w, h) \ static void xavs2_blockcopy_sp_##w##x##h(pel_t *a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ assert((b[x] >= 0) && (b[x] <= ((1 << 8) - 1)));\ a[x] = (pel_t)b[x];\ }\ a += stridea;\ b += strideb;\ }\ } #define BLOCKCOPY_PS_C(w, h) \ static void xavs2_blockcopy_ps_##w##x##h(coeff_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = (int16_t)b[x];\ }\ a += stridea;\ b += strideb;\ }\ }\ #define PIXEL_SUB_PS_C(w, h) \ static void xavs2_pixel_sub_ps_##w##x##h(coeff_t *a, intptr_t dstride, const pel_t *b0, const pel_t *b1, intptr_t sstride0, intptr_t sstride1)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = (int16_t)(b0[x] - b1[x]);\ }\ b0 += sstride0;\ b1 += sstride1;\ a += dstride;\ }\ } #define PIXEL_ADD_PS_C(w, h) \ static void xavs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t 
sstride1)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = (pel_t)XAVS2_CLIP1(b0[x] + b1[x]);\ }\ b0 += sstride0;\ b1 += sstride1;\ a += dstride;\ }\ } #define BLOCK_OP_C(w, h) \ BLOCKCOPY_PP_C(w, h);\ BLOCKCOPY_SS_C(w, h);\ BLOCKCOPY_SP_C(w, h);\ BLOCKCOPY_PS_C(w, h);\ PIXEL_SUB_PS_C(w, h);\ PIXEL_ADD_PS_C(w, h); BLOCK_OP_C(64, 64) /* 64x64 */ BLOCK_OP_C(64, 32) BLOCK_OP_C(32, 64) BLOCK_OP_C(64, 16) BLOCK_OP_C(64, 48) BLOCK_OP_C(16, 64) BLOCK_OP_C(48, 64) BLOCK_OP_C(32, 32) /* 32x32 */ BLOCK_OP_C(32, 16) BLOCK_OP_C(16, 32) BLOCK_OP_C(32, 8) BLOCK_OP_C(32, 24) BLOCK_OP_C( 8, 32) BLOCK_OP_C(24, 32) BLOCK_OP_C(16, 16) /* 16x16 */ BLOCK_OP_C(16, 8) BLOCK_OP_C( 8, 16) BLOCK_OP_C(16, 4) BLOCK_OP_C(16, 12) BLOCK_OP_C( 4, 16) BLOCK_OP_C(12, 16) BLOCK_OP_C( 8, 8) /* 8x8 */ BLOCK_OP_C( 8, 4) BLOCK_OP_C( 4, 8) BLOCK_OP_C( 4, 4) /* 4x4 */ /* --------------------------------------------------------------------------- */ static void xavs2_pixel_average(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height) { int i, j; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { dst[j] = (pel_t)((src1[j] + src2[j] + 1) >> 1); } dst += i_dst; src1 += i_src1; src2 += i_src2; } } /* --------------------------------------------------------------------------- * init functions of block operation : copy / add / sub */ static void init_block_opreation_funcs(uint32_t cpuid, pixel_funcs_t* pixf) { #define ALL_LUMA_CU(name1, name2, cpu) \ pixf->name1[LUMA_64x64] = xavs2_ ## name2 ## _64x64 ## cpu;\ pixf->name1[LUMA_32x32] = xavs2_ ## name2 ## _32x32 ## cpu;\ pixf->name1[LUMA_16x16] = xavs2_ ## name2 ## _16x16 ## cpu;\ pixf->name1[LUMA_8x8 ] = xavs2_ ## name2 ## _8x8 ## cpu;\ pixf->name1[LUMA_4x4 ] = xavs2_ ## name2 ## _4x4 ## cpu #define ALL_LUMA_PU(name1, name2, cpu) \ pixf->name1[LUMA_64x64] = xavs2_ ## name2 ## _64x64 ## cpu; /* 64x64 */ \ pixf->name1[LUMA_64x32] = xavs2_ ## name2 ## _64x32 ## cpu;\ pixf->name1[LUMA_32x64] = xavs2_ ## name2 ## _32x64 ## cpu;\ pixf->name1[LUMA_64x16] = xavs2_ ## name2 ## _64x16 ## cpu;\ pixf->name1[LUMA_64x48] = xavs2_ ## name2 ## _64x48 ## cpu;\ pixf->name1[LUMA_16x64] = xavs2_ ## name2 ## _16x64 ## cpu;\ pixf->name1[LUMA_48x64] = xavs2_ ## name2 ## _48x64 ## cpu;\ pixf->name1[LUMA_32x32] = xavs2_ ## name2 ## _32x32 ## cpu; /* 32x32 */ \ pixf->name1[LUMA_32x16] = xavs2_ ## name2 ## _32x16 ## cpu;\ pixf->name1[LUMA_16x32] = xavs2_ ## name2 ## _16x32 ## cpu;\ pixf->name1[LUMA_32x8 ] = xavs2_ ## name2 ## _32x8 ## cpu;\ pixf->name1[LUMA_32x24] = xavs2_ ## name2 ## _32x24 ## cpu;\ pixf->name1[LUMA_8x32 ] = xavs2_ ## name2 ## _8x32 ## cpu;\ pixf->name1[LUMA_24x32] = xavs2_ ## name2 ## _24x32 ## cpu;\ pixf->name1[LUMA_16x16] = xavs2_ ## name2 ## _16x16 ## cpu; /* 16x16 */ \ pixf->name1[LUMA_16x8 ] = xavs2_ ## name2 ## _16x8 ## cpu;\ pixf->name1[LUMA_8x16 ] = xavs2_ ## name2 ## _8x16 ## cpu;\ pixf->name1[LUMA_16x4 ] = xavs2_ ## name2 ## _16x4 ## cpu;\ pixf->name1[LUMA_16x12] = xavs2_ ## name2 ## _16x12 ## cpu;\ pixf->name1[LUMA_4x16 ] = xavs2_ ## name2 ## _4x16 ## cpu;\ pixf->name1[LUMA_12x16] = xavs2_ ## name2 ## _12x16 ## cpu;\ pixf->name1[LUMA_8x8 ] = xavs2_ ## name2 ## _8x8 ## cpu; /* 8x8 */ \ pixf->name1[LUMA_8x4 ] = xavs2_ ## name2 ## _8x4 ## cpu;\ pixf->name1[LUMA_4x8 ] = xavs2_ ## name2 ## _4x8 ## cpu;\ pixf->name1[LUMA_4x4 ] = xavs2_ ## name2 ## _4x4 ## cpu /* 4x4 */ /* ------------------------------------------------------------- * init all c functions */ //ALL_LUMA_CU(add_ps, pixel_add_ps, ); 
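    /* Note on the init pattern: the plain C kernels below are installed first
     * for every partition size, so the dispatch table is always complete; the
     * HAVE_MMX branches further down then overwrite only those entries for
     * which an optimized kernel exists on the detected CPU.  Callers dispatch
     * through the table rather than calling a size-specific function directly,
     * e.g. (illustrative sketch only; `pf', `dst', `src' and the strides are
     * placeholder names):
     *     pixel_funcs_t pf;
     *     xavs2_pixel_init(cpuid, &pf);    (which also runs this init)
     *     pf.copy_pp[PART_INDEX(16, 8)](dst, i_dst, src, i_src);
     */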
ALL_LUMA_PU(add_ps, pixel_add_ps, ); // ALL_LUMA_CU(sub_ps, pixel_sub_ps, ); ALL_LUMA_PU(sub_ps, pixel_sub_ps, ); ALL_LUMA_PU(copy_sp, blockcopy_sp, ); ALL_LUMA_PU(copy_ps, blockcopy_ps, ); ALL_LUMA_PU(copy_ss, blockcopy_ss, ); ALL_LUMA_PU(copy_pp, blockcopy_pp, ); pixf->ssd_block = xavs2_get_block_ssd_c; /* ------------------------------------------------------------- * init all SIMD functions */ #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE2) { ALL_LUMA_PU(copy_sp, blockcopy_sp, _sse2); ALL_LUMA_PU(copy_ss, blockcopy_ss, _sse2); ALL_LUMA_PU(copy_pp, blockcopy_pp, _sse2); } if (cpuid & XAVS2_CPU_SSE4) { pixf->add_ps [LUMA_4x4 ] = xavs2_pixel_add_ps_4x4_sse4; pixf->add_ps [LUMA_4x8 ] = xavs2_pixel_add_ps_4x8_sse4; pixf->add_ps [LUMA_4x16 ] = xavs2_pixel_add_ps_4x16_sse4; pixf->add_ps [LUMA_8x8 ] = xavs2_pixel_add_ps_8x8_sse4; pixf->add_ps [LUMA_8x16 ] = xavs2_pixel_add_ps_8x16_sse4; pixf->add_ps [LUMA_8x32 ] = xavs2_pixel_add_ps_8x32_sse4; pixf->add_ps [LUMA_16x4 ] = xavs2_pixel_add_ps_16x4_sse4; pixf->add_ps [LUMA_16x8 ] = xavs2_pixel_add_ps_16x8_sse4; pixf->add_ps [LUMA_16x12] = xavs2_pixel_add_ps_16x12_sse4; pixf->add_ps [LUMA_16x16] = xavs2_pixel_add_ps_16x16_sse4; pixf->add_ps [LUMA_16x64] = xavs2_pixel_add_ps_16x64_sse4; pixf->add_ps [LUMA_32x8 ] = xavs2_pixel_add_ps_32x8_sse4; // pixf->add_ps [LUMA_32x16] = xavs2_pixel_add_ps_32x16_sse4; // pixf->add_ps [LUMA_32x24] = xavs2_pixel_add_ps_32x24_sse4; pixf->add_ps [LUMA_32x32] = xavs2_pixel_add_ps_32x32_sse4; pixf->add_ps [LUMA_32x64] = xavs2_pixel_add_ps_32x64_sse4; pixf->add_ps [LUMA_64x16] = xavs2_pixel_add_ps_64x16_sse4; // pixf->add_ps [LUMA_64x32] = xavs2_pixel_add_ps_64x32_sse4; // pixf->add_ps [LUMA_64x48] = xavs2_pixel_add_ps_64x48_sse4; pixf->add_ps [LUMA_64x64] = xavs2_pixel_add_ps_64x64_sse4; pixf->sub_ps [LUMA_4x4 ] = xavs2_pixel_sub_ps_4x4_sse4; pixf->sub_ps [LUMA_4x8 ] = xavs2_pixel_sub_ps_4x8_sse4; pixf->sub_ps [LUMA_4x16 ] = xavs2_pixel_sub_ps_4x16_sse4; pixf->sub_ps [LUMA_8x8 ] = xavs2_pixel_sub_ps_8x8_sse4; pixf->sub_ps [LUMA_8x16 ] = xavs2_pixel_sub_ps_8x16_sse4; pixf->sub_ps [LUMA_8x32 ] = xavs2_pixel_sub_ps_8x32_sse4; pixf->sub_ps [LUMA_16x4 ] = xavs2_pixel_sub_ps_16x4_sse4; // pixf->sub_ps [LUMA_16x8 ] = xavs2_pixel_sub_ps_16x8_sse4; // pixf->sub_ps [LUMA_16x12] = xavs2_pixel_sub_ps_16x12_sse4; pixf->sub_ps [LUMA_16x16] = xavs2_pixel_sub_ps_16x16_sse4; pixf->sub_ps [LUMA_16x64] = xavs2_pixel_sub_ps_16x64_sse4; pixf->sub_ps [LUMA_32x8 ] = xavs2_pixel_sub_ps_32x8_sse4; // pixf->sub_ps [LUMA_32x16] = xavs2_pixel_sub_ps_32x16_sse4; // pixf->sub_ps [LUMA_32x24] = xavs2_pixel_sub_ps_32x24_sse4; pixf->sub_ps [LUMA_32x32] = xavs2_pixel_sub_ps_32x32_sse4; pixf->sub_ps [LUMA_32x64] = xavs2_pixel_sub_ps_32x64_sse4; pixf->sub_ps [LUMA_64x16] = xavs2_pixel_sub_ps_64x16_sse4; // pixf->sub_ps [LUMA_64x32] = xavs2_pixel_sub_ps_64x32_sse4; // pixf->sub_ps [LUMA_64x48] = xavs2_pixel_sub_ps_64x48_sse4; pixf->sub_ps [LUMA_64x64] = xavs2_pixel_sub_ps_64x64_sse4; ALL_LUMA_PU(copy_ps, blockcopy_ps, _sse4); } if (cpuid & XAVS2_CPU_AVX) { pixf->copy_pp[LUMA_64x64] = xavs2_blockcopy_pp_64x64_avx; pixf->copy_pp[LUMA_64x32] = xavs2_blockcopy_pp_64x32_avx; pixf->copy_pp[LUMA_32x64] = xavs2_blockcopy_pp_32x64_avx; pixf->copy_pp[LUMA_64x16] = xavs2_blockcopy_pp_64x16_avx; pixf->copy_pp[LUMA_64x48] = xavs2_blockcopy_pp_64x48_avx; pixf->copy_pp[LUMA_48x64] = xavs2_blockcopy_pp_48x64_avx; pixf->copy_pp[LUMA_32x32] = xavs2_blockcopy_pp_32x32_avx; pixf->copy_pp[LUMA_32x16] = xavs2_blockcopy_pp_32x16_avx; pixf->copy_pp[LUMA_32x8 ] = 
xavs2_blockcopy_pp_32x8_avx; pixf->copy_pp[LUMA_32x24] = xavs2_blockcopy_pp_32x24_avx; pixf->copy_ss[LUMA_64x64] = xavs2_blockcopy_ss_64x64_avx; pixf->copy_ss[LUMA_64x32] = xavs2_blockcopy_ss_64x32_avx; pixf->copy_ss[LUMA_32x64] = xavs2_blockcopy_ss_32x64_avx; pixf->copy_ss[LUMA_64x16] = xavs2_blockcopy_ss_64x16_avx; pixf->copy_ss[LUMA_64x48] = xavs2_blockcopy_ss_64x48_avx; pixf->copy_ss[LUMA_16x64] = xavs2_blockcopy_ss_16x64_avx; pixf->copy_ss[LUMA_48x64] = xavs2_blockcopy_ss_48x64_avx; pixf->copy_ss[LUMA_32x32] = xavs2_blockcopy_ss_32x32_avx; pixf->copy_ss[LUMA_32x16] = xavs2_blockcopy_ss_32x16_avx; pixf->copy_ss[LUMA_16x32] = xavs2_blockcopy_ss_16x32_avx; pixf->copy_ss[LUMA_32x8 ] = xavs2_blockcopy_ss_32x8_avx; pixf->copy_ss[LUMA_32x24] = xavs2_blockcopy_ss_32x24_avx; pixf->copy_ss[LUMA_24x32] = xavs2_blockcopy_ss_24x32_avx; pixf->copy_ss[LUMA_16x16] = xavs2_blockcopy_ss_16x16_avx; pixf->copy_ss[LUMA_16x8 ] = xavs2_blockcopy_ss_16x8_avx; pixf->copy_ss[LUMA_16x4 ] = xavs2_blockcopy_ss_16x4_avx; pixf->copy_ss[LUMA_16x12] = xavs2_blockcopy_ss_16x12_avx; } if (cpuid & XAVS2_CPU_AVX2) { pixf->add_ps [LUMA_16x4 ] = xavs2_pixel_add_ps_16x4_avx2; pixf->add_ps [LUMA_16x8 ] = xavs2_pixel_add_ps_16x8_avx2; pixf->add_ps [LUMA_16x16] = xavs2_pixel_add_ps_16x16_avx2; pixf->add_ps [LUMA_16x32] = xavs2_pixel_add_ps_16x32_avx2; pixf->add_ps [LUMA_16x64] = xavs2_pixel_add_ps_16x64_avx2; #if ARCH_X86_64 pixf->add_ps [LUMA_32x8 ] = xavs2_pixel_add_ps_32x8_avx2; pixf->add_ps [LUMA_32x16] = xavs2_pixel_add_ps_32x16_avx2; pixf->add_ps [LUMA_32x24] = xavs2_pixel_add_ps_32x24_avx2; pixf->add_ps [LUMA_32x32] = xavs2_pixel_add_ps_32x32_avx2; pixf->add_ps [LUMA_32x64] = xavs2_pixel_add_ps_32x64_avx2; #endif pixf->add_ps [LUMA_64x16] = xavs2_pixel_add_ps_64x16_avx2; pixf->add_ps [LUMA_64x32] = xavs2_pixel_add_ps_64x32_avx2; pixf->add_ps [LUMA_64x48] = xavs2_pixel_add_ps_64x48_avx2; pixf->add_ps [LUMA_64x64] = xavs2_pixel_add_ps_64x64_avx2; #if ARCH_X86_64 pixf->sub_ps [LUMA_16x16] = xavs2_pixel_sub_ps_16x16_avx2; pixf->sub_ps [LUMA_16x32] = xavs2_pixel_sub_ps_16x32_avx2; pixf->sub_ps [LUMA_16x64] = xavs2_pixel_sub_ps_16x64_avx2; pixf->sub_ps [LUMA_32x8 ] = xavs2_pixel_sub_ps_32x8_avx2; pixf->sub_ps [LUMA_32x16] = xavs2_pixel_sub_ps_32x16_avx2; pixf->sub_ps [LUMA_32x32] = xavs2_pixel_sub_ps_32x32_avx2; pixf->sub_ps [LUMA_32x64] = xavs2_pixel_sub_ps_32x64_avx2; #endif pixf->sub_ps [LUMA_64x16] = xavs2_pixel_sub_ps_64x16_avx2; pixf->sub_ps [LUMA_64x64] = xavs2_pixel_sub_ps_64x64_avx2; pixf->copy_sp[LUMA_64x64] = xavs2_blockcopy_sp_64x64_avx2; pixf->copy_sp[LUMA_32x64] = xavs2_blockcopy_sp_32x64_avx2; pixf->copy_sp[LUMA_32x32] = xavs2_blockcopy_sp_32x32_avx2; pixf->copy_sp[LUMA_16x32] = xavs2_blockcopy_sp_16x32_avx2; pixf->copy_sp[LUMA_16x16] = xavs2_blockcopy_sp_16x16_avx2; pixf->copy_ps[LUMA_64x64] = xavs2_blockcopy_ps_64x64_avx2; pixf->copy_ps[LUMA_32x64] = xavs2_blockcopy_ps_32x64_avx2; pixf->copy_ps[LUMA_32x32] = xavs2_blockcopy_ps_32x32_avx2; pixf->copy_ps[LUMA_16x32] = xavs2_blockcopy_ps_16x32_avx2; pixf->copy_ps[LUMA_16x16] = xavs2_blockcopy_ps_16x16_avx2; } #endif // if HAVE_MMX #undef ALL_LUMA_CU #undef ALL_LUMA_PU } /** * --------------------------------------------------------------------------- * pixel init * --------------------------------------------------------------------------- */ void xavs2_pixel_init(uint32_t cpuid, pixel_funcs_t* pixf) { /* ------------------------------------------------------------- */ #define INIT_PIXEL_FUNC(name, cpu) \ /* 64x64 */ \ pixf->name[LUMA_64x64] = xavs2_pixel_ ## 
name ## _64x64 ## cpu;\ pixf->name[LUMA_64x32] = xavs2_pixel_ ## name ## _64x32 ## cpu;\ pixf->name[LUMA_32x64] = xavs2_pixel_ ## name ## _32x64 ## cpu;\ pixf->name[LUMA_64x16] = xavs2_pixel_ ## name ## _64x16 ## cpu;\ pixf->name[LUMA_64x48] = xavs2_pixel_ ## name ## _64x48 ## cpu;\ pixf->name[LUMA_16x64] = xavs2_pixel_ ## name ## _16x64 ## cpu;\ pixf->name[LUMA_48x64] = xavs2_pixel_ ## name ## _48x64 ## cpu;\ /* 32x32 */ \ pixf->name[LUMA_32x32] = xavs2_pixel_ ## name ## _32x32 ## cpu;\ pixf->name[LUMA_32x16] = xavs2_pixel_ ## name ## _32x16 ## cpu;\ pixf->name[LUMA_16x32] = xavs2_pixel_ ## name ## _16x32 ## cpu;\ pixf->name[LUMA_32x8 ] = xavs2_pixel_ ## name ## _32x8 ## cpu;\ pixf->name[LUMA_32x24] = xavs2_pixel_ ## name ## _32x24 ## cpu;\ pixf->name[LUMA_8x32 ] = xavs2_pixel_ ## name ## _8x32 ## cpu;\ pixf->name[LUMA_24x32] = xavs2_pixel_ ## name ## _24x32 ## cpu;\ /* 16x16 */ \ pixf->name[LUMA_16x16] = xavs2_pixel_ ## name ## _16x16 ## cpu;\ pixf->name[LUMA_16x8 ] = xavs2_pixel_ ## name ## _16x8 ## cpu;\ pixf->name[LUMA_8x16 ] = xavs2_pixel_ ## name ## _8x16 ## cpu;\ pixf->name[LUMA_16x4 ] = xavs2_pixel_ ## name ## _16x4 ## cpu;\ pixf->name[LUMA_16x12] = xavs2_pixel_ ## name ## _16x12 ## cpu;\ pixf->name[LUMA_4x16 ] = xavs2_pixel_ ## name ## _4x16 ## cpu;\ pixf->name[LUMA_12x16] = xavs2_pixel_ ## name ## _12x16 ## cpu;\ /* 8x8 */ \ pixf->name[LUMA_8x8 ] = xavs2_pixel_ ## name ## _8x8 ## cpu;\ pixf->name[LUMA_8x4 ] = xavs2_pixel_ ## name ## _8x4 ## cpu;\ pixf->name[LUMA_4x8 ] = xavs2_pixel_ ## name ## _4x8 ## cpu;\ /* 4x4 */ \ pixf->name[LUMA_4x4 ] = xavs2_pixel_ ## name ## _4x4 ## cpu; /* ------------------------------------------------------------- */ #define INIT_SATD(cpu) \ pixf->satd[LUMA_64x64] = xavs2_pixel_satd_64x64_ ## cpu; /* 64x64 */ \ pixf->satd[LUMA_64x32] = xavs2_pixel_satd_64x32_ ## cpu;\ pixf->satd[LUMA_32x64] = xavs2_pixel_satd_32x64_ ## cpu;\ pixf->satd[LUMA_64x16] = xavs2_pixel_satd_64x16_ ## cpu;\ pixf->satd[LUMA_64x48] = xavs2_pixel_satd_64x48_ ## cpu;\ pixf->satd[LUMA_16x64] = xavs2_pixel_satd_16x64_ ## cpu;\ pixf->satd[LUMA_48x64] = xavs2_pixel_satd_48x64_ ## cpu;\ pixf->satd[LUMA_32x32] = xavs2_pixel_satd_32x32_ ## cpu; /* 32x32 */ \ pixf->satd[LUMA_32x16] = xavs2_pixel_satd_32x16_ ## cpu;\ pixf->satd[LUMA_16x32] = xavs2_pixel_satd_16x32_ ## cpu;\ pixf->satd[LUMA_32x8 ] = xavs2_pixel_satd_32x8_ ## cpu;\ pixf->satd[LUMA_32x24] = xavs2_pixel_satd_32x24_ ## cpu;\ pixf->satd[LUMA_8x32 ] = xavs2_pixel_satd_8x32_ ## cpu;\ pixf->satd[LUMA_24x32] = xavs2_pixel_satd_24x32_ ## cpu;\ pixf->satd[LUMA_16x16] = xavs2_pixel_satd_16x16_ ## cpu; /* 16x16 */ \ pixf->satd[LUMA_16x8 ] = xavs2_pixel_satd_16x8_ ## cpu;\ pixf->satd[LUMA_8x16 ] = xavs2_pixel_satd_8x16_ ## cpu;\ pixf->satd[LUMA_16x4 ] = xavs2_pixel_satd_16x4_ ## cpu;\ pixf->satd[LUMA_16x12] = xavs2_pixel_satd_16x12_ ## cpu;\ pixf->satd[LUMA_4x16 ] = xavs2_pixel_satd_4x16_ ## cpu;\ pixf->satd[LUMA_12x16] = xavs2_pixel_satd_12x16_ ## cpu;\ pixf->satd[LUMA_8x8 ] = xavs2_pixel_satd_8x8_ ## cpu; /* 8x8 */ \ pixf->satd[LUMA_8x4 ] = xavs2_pixel_satd_8x4_ ## cpu;\ pixf->satd[LUMA_4x8 ] = xavs2_pixel_satd_4x8_ ## cpu; /* ------------------------------------------------------------- */ #define INIT_SSD(cpu) \ pixf->ssd[LUMA_32x64] = xavs2_pixel_ssd_32x64_ ## cpu;\ pixf->ssd[LUMA_16x64] = xavs2_pixel_ssd_16x64_ ## cpu;\ pixf->ssd[LUMA_32x32] = xavs2_pixel_ssd_32x32_ ## cpu;\ pixf->ssd[LUMA_32x16] = xavs2_pixel_ssd_32x16_ ## cpu;\ pixf->ssd[LUMA_16x32] = xavs2_pixel_ssd_16x32_ ## cpu;\ pixf->ssd[LUMA_32x24] = xavs2_pixel_ssd_32x24_ 
## cpu;\ pixf->ssd[LUMA_32x8 ] = xavs2_pixel_ssd_32x8_ ## cpu;\ pixf->ssd[LUMA_8x32 ] = xavs2_pixel_ssd_8x32_ ## cpu;\ pixf->ssd[LUMA_16x16] = xavs2_pixel_ssd_16x16_ ## cpu;\ pixf->ssd[LUMA_16x8 ] = xavs2_pixel_ssd_16x8_ ## cpu;\ pixf->ssd[LUMA_8x16 ] = xavs2_pixel_ssd_8x16_ ## cpu;\ pixf->ssd[LUMA_16x12] = xavs2_pixel_ssd_16x12_ ## cpu;\ pixf->ssd[LUMA_16x4 ] = xavs2_pixel_ssd_16x4_ ## cpu;\ /* SIMD_ERROR pixf->ssd[LUMA_8x8 ] = xavs2_pixel_ssd_8x8_ ## cpu;*/\ pixf->ssd[LUMA_8x4 ] = xavs2_pixel_ssd_8x4_ ## cpu /* clear */ memset(pixf, 0, sizeof(pixel_funcs_t)); /* ------------------------------------------------------------- * init all c functions */ INIT_PIXEL_FUNC(sad, ); // sad INIT_PIXEL_FUNC(sad_x3, ); // sad_x3 INIT_PIXEL_FUNC(sad_x4, ); // sad_x4 INIT_PIXEL_FUNC(satd, ); // satd INIT_PIXEL_FUNC(ssd, ); // ssd INIT_PIXEL_FUNC(avg, ); // avg INIT_PIXEL_FUNC(sa8d, ); // sa8d pixf->average = xavs2_pixel_average;// block average /* ------------------------------------------------------------- * init SIMD functions */ #if HAVE_MMX if (cpuid & XAVS2_CPU_MMX2) { pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_mmx2; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_mmx2; pixf->sad [LUMA_8x16 ] = xavs2_pixel_sad_8x16_mmx2; pixf->sad [LUMA_16x4 ] = xavs2_pixel_sad_16x4_mmx2; pixf->sad [LUMA_4x16 ] = xavs2_pixel_sad_4x16_mmx2; pixf->sad [LUMA_8x8 ] = xavs2_pixel_sad_8x8_mmx2; pixf->sad [LUMA_8x4 ] = xavs2_pixel_sad_8x4_mmx2; pixf->sad [LUMA_4x8 ] = xavs2_pixel_sad_4x8_mmx2; pixf->sad [LUMA_4x4 ] = xavs2_pixel_sad_4x4_mmx2; pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_mmx2; pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_mmx2; pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_mmx2; pixf->sad_x3[LUMA_8x8 ] = xavs2_pixel_sad_x3_8x8_mmx2; pixf->sad_x3[LUMA_8x4 ] = xavs2_pixel_sad_x3_8x4_mmx2; pixf->sad_x3[LUMA_4x16 ] = xavs2_pixel_sad_x3_4x16_mmx2; pixf->sad_x3[LUMA_4x8 ] = xavs2_pixel_sad_x3_4x8_mmx2; pixf->sad_x3[LUMA_4x4 ] = xavs2_pixel_sad_x3_4x4_mmx2; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_mmx2; pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_mmx2; pixf->sad_x4[LUMA_8x16 ] = xavs2_pixel_sad_x4_8x16_mmx2; pixf->sad_x4[LUMA_8x8 ] = xavs2_pixel_sad_x4_8x8_mmx2; pixf->sad_x4[LUMA_8x4 ] = xavs2_pixel_sad_x4_8x4_mmx2; pixf->sad_x4[LUMA_4x16 ] = xavs2_pixel_sad_x4_4x16_mmx2; pixf->sad_x4[LUMA_4x8 ] = xavs2_pixel_sad_x4_4x8_mmx2; pixf->sad_x4[LUMA_4x4 ] = xavs2_pixel_sad_x4_4x4_mmx2; pixf->ssd [LUMA_16x16] = xavs2_pixel_ssd_16x16_mmx; pixf->ssd [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_mmx; pixf->ssd [LUMA_8x16 ] = xavs2_pixel_ssd_8x16_mmx; pixf->ssd [LUMA_4x16 ] = xavs2_pixel_ssd_4x16_mmx; pixf->ssd [LUMA_8x8 ] = xavs2_pixel_ssd_8x8_mmx; pixf->ssd [LUMA_8x4 ] = xavs2_pixel_ssd_8x4_mmx; pixf->ssd [LUMA_4x8 ] = xavs2_pixel_ssd_4x8_mmx; pixf->ssd [LUMA_4x4 ] = xavs2_pixel_ssd_4x4_mmx; pixf->satd [LUMA_16x16] = xavs2_pixel_satd_16x16_mmx2; pixf->satd [LUMA_16x8 ] = xavs2_pixel_satd_16x8_mmx2; pixf->satd [LUMA_8x16 ] = xavs2_pixel_satd_8x16_mmx2; pixf->satd [LUMA_4x16 ] = xavs2_pixel_satd_4x16_mmx2; pixf->satd [LUMA_8x8 ] = xavs2_pixel_satd_8x8_mmx2; pixf->satd [LUMA_8x4 ] = xavs2_pixel_satd_8x4_mmx2; pixf->satd [LUMA_4x8 ] = xavs2_pixel_satd_4x8_mmx2; pixf->satd [LUMA_4x4 ] = xavs2_pixel_satd_4x4_mmx2; //pixf->sa8d [LUMA_16x16] = xavs2_pixel_satd_16x16_mmx2; // not found in x265 //pixf->sa8d [LUMA_16x8 ] = xavs2_pixel_satd_16x8_mmx2; //pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_satd_8x16_mmx2; //pixf->sa8d [LUMA_4x16 ] = xavs2_pixel_satd_4x16_mmx2; //pixf->sa8d [LUMA_8x8 ] = 
xavs2_pixel_satd_8x8_mmx2; //pixf->sa8d [LUMA_8x4 ] = xavs2_pixel_satd_8x4_mmx2; //pixf->sa8d [LUMA_4x8 ] = xavs2_pixel_satd_4x8_mmx2; pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_mmx2; } if (cpuid & XAVS2_CPU_SSE2) { pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_sse2; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse2; pixf->sad [LUMA_16x12] = xavs2_pixel_sad_16x12_sse2; pixf->sad [LUMA_16x32] = xavs2_pixel_sad_16x32_sse2; pixf->sad [LUMA_16x64] = xavs2_pixel_sad_16x64_sse2; pixf->sad [LUMA_16x4 ] = xavs2_pixel_sad_16x4_sse2; pixf->sad [LUMA_32x8 ] = xavs2_pixel_sad_32x8_sse2; pixf->sad [LUMA_32x24] = xavs2_pixel_sad_32x24_sse2; pixf->sad [LUMA_32x32] = xavs2_pixel_sad_32x32_sse2; pixf->sad [LUMA_32x16] = xavs2_pixel_sad_32x16_sse2; pixf->sad [LUMA_32x64] = xavs2_pixel_sad_32x64_sse2; pixf->sad [LUMA_8x32 ] = xavs2_pixel_sad_8x32_sse2; pixf->sad [LUMA_64x16] = xavs2_pixel_sad_64x16_sse2; pixf->sad [LUMA_64x32] = xavs2_pixel_sad_64x32_sse2; pixf->sad [LUMA_64x48] = xavs2_pixel_sad_64x48_sse2; pixf->sad [LUMA_64x64] = xavs2_pixel_sad_64x64_sse2; pixf->sad [LUMA_48x64] = xavs2_pixel_sad_48x64_sse2; pixf->sad [LUMA_24x32] = xavs2_pixel_sad_24x32_sse2; pixf->sad [LUMA_12x16] = xavs2_pixel_sad_12x16_sse2; pixf->sa8d [LUMA_64x16] = xavs2_pixel_sa8d_64x16_sse2; pixf->sa8d [LUMA_64x32] = xavs2_pixel_sa8d_64x32_sse2; pixf->sa8d [LUMA_64x48] = xavs2_pixel_sa8d_64x48_sse2; pixf->sa8d [LUMA_48x64] = xavs2_pixel_sa8d_48x64_sse2; pixf->sa8d [LUMA_24x32] = xavs2_pixel_sa8d_24x32_sse2; pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_sse2; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_sse2; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_sse2; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_sse2; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_sse2; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_sse2; pixf->sa8d [LUMA_64x64] = xavs2_pixel_sa8d_64x64_sse2; INIT_SATD(sse2); pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_sse2; pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_sse2; pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_sse2; pixf->sad_x3[LUMA_8x8 ] = xavs2_pixel_sad_x3_8x8_sse2; pixf->sad_x3[LUMA_8x4 ] = xavs2_pixel_sad_x3_8x4_sse2; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_sse2; pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_sse2; pixf->sad_x4[LUMA_8x16 ] = xavs2_pixel_sad_x4_8x16_sse2; pixf->sad_x4[LUMA_8x8 ] = xavs2_pixel_sad_x4_8x8_sse2; pixf->sad_x4[LUMA_8x4 ] = xavs2_pixel_sad_x4_8x4_sse2; INIT_SSD (sse2); } if (cpuid & XAVS2_CPU_SSE3) { pixf->sad [LUMA_16x16] = xavs2_pixel_sad_16x16_sse3; pixf->sad [LUMA_16x8 ] = xavs2_pixel_sad_16x8_sse3; pixf->sad [LUMA_16x12] = xavs2_pixel_sad_16x12_sse3; pixf->sad [LUMA_16x32] = xavs2_pixel_sad_16x32_sse3; pixf->sad [LUMA_16x64] = xavs2_pixel_sad_16x64_sse3; pixf->sad [LUMA_16x4 ] = xavs2_pixel_sad_16x4_sse3; pixf->sad [LUMA_32x8 ] = xavs2_pixel_sad_32x8_sse3; pixf->sad [LUMA_32x24] = xavs2_pixel_sad_32x24_sse3; pixf->sad [LUMA_32x32] = xavs2_pixel_sad_32x32_sse3; pixf->sad [LUMA_32x16] = xavs2_pixel_sad_32x16_sse3; pixf->sad [LUMA_32x64] = xavs2_pixel_sad_32x64_sse3; pixf->sad [LUMA_8x32 ] = xavs2_pixel_sad_8x32_sse3; pixf->sad [LUMA_64x16] = xavs2_pixel_sad_64x16_sse3; pixf->sad [LUMA_64x32] = xavs2_pixel_sad_64x32_sse3; pixf->sad [LUMA_64x48] = xavs2_pixel_sad_64x48_sse3; pixf->sad [LUMA_64x64] = xavs2_pixel_sad_64x64_sse3; pixf->sad [LUMA_48x64] = xavs2_pixel_sad_48x64_sse3; pixf->sad [LUMA_24x32] = xavs2_pixel_sad_24x32_sse3; pixf->sad [LUMA_12x16] = xavs2_pixel_sad_12x16_sse3; pixf->sad_x3[LUMA_16x16] = 
xavs2_pixel_sad_x3_16x16_sse3; pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_sse3; pixf->sad_x3[LUMA_16x4 ] = xavs2_pixel_sad_x3_16x4_sse3; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_sse3; pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_sse3; pixf->sad_x4[LUMA_16x4 ] = xavs2_pixel_sad_x4_16x4_sse3; } if (cpuid & XAVS2_CPU_SSSE3) { INIT_SATD(ssse3); pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_ssse3; /* 64x64 */ pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_ssse3; pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_ssse3; pixf->sad_x3[LUMA_64x16] = xavs2_pixel_sad_x3_64x16_ssse3; pixf->sad_x3[LUMA_64x48] = xavs2_pixel_sad_x3_64x48_ssse3; pixf->sad_x3[LUMA_16x64] = xavs2_pixel_sad_x3_16x64_ssse3; pixf->sad_x3[LUMA_48x64] = xavs2_pixel_sad_x3_48x64_ssse3; pixf->sad_x3[LUMA_32x32] = xavs2_pixel_sad_x3_32x32_ssse3; /* 32x32 */ pixf->sad_x3[LUMA_32x16] = xavs2_pixel_sad_x3_32x16_ssse3; pixf->sad_x3[LUMA_16x32] = xavs2_pixel_sad_x3_16x32_ssse3; pixf->sad_x3[LUMA_32x8 ] = xavs2_pixel_sad_x3_32x8_ssse3; pixf->sad_x3[LUMA_32x24] = xavs2_pixel_sad_x3_32x24_ssse3; pixf->sad_x3[LUMA_8x32 ] = xavs2_pixel_sad_x3_8x32_ssse3; pixf->sad_x3[LUMA_24x32] = xavs2_pixel_sad_x3_24x32_ssse3; pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_ssse3; /* 16x16 */ pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_ssse3; pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_ssse3; pixf->sad_x3[LUMA_12x16] = xavs2_pixel_sad_x3_12x16_ssse3; pixf->sad_x4[LUMA_64x64] = xavs2_pixel_sad_x4_64x64_ssse3; /* 64x64 */ pixf->sad_x4[LUMA_64x32] = xavs2_pixel_sad_x4_64x32_ssse3; pixf->sad_x4[LUMA_32x64] = xavs2_pixel_sad_x4_32x64_ssse3; pixf->sad_x4[LUMA_64x16] = xavs2_pixel_sad_x4_64x16_ssse3; pixf->sad_x4[LUMA_64x48] = xavs2_pixel_sad_x4_64x48_ssse3; pixf->sad_x4[LUMA_16x64] = xavs2_pixel_sad_x4_16x64_ssse3; pixf->sad_x4[LUMA_48x64] = xavs2_pixel_sad_x4_48x64_ssse3; pixf->sad_x4[LUMA_32x32] = xavs2_pixel_sad_x4_32x32_ssse3; /* 32x32 */ pixf->sad_x4[LUMA_32x16] = xavs2_pixel_sad_x4_32x16_ssse3; pixf->sad_x4[LUMA_16x32] = xavs2_pixel_sad_x4_16x32_ssse3; pixf->sad_x4[LUMA_32x8 ] = xavs2_pixel_sad_x4_32x8_ssse3; pixf->sad_x4[LUMA_32x24] = xavs2_pixel_sad_x4_32x24_ssse3; pixf->sad_x4[LUMA_8x32 ] = xavs2_pixel_sad_x4_8x32_ssse3; pixf->sad_x4[LUMA_24x32] = xavs2_pixel_sad_x4_24x32_ssse3; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_ssse3; /* 16x16 */ pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_ssse3; pixf->sad_x4[LUMA_8x16 ] = xavs2_pixel_sad_x4_8x16_ssse3; pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_ssse3; INIT_SSD (ssse3); pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_ssse3; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_ssse3; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_ssse3; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_ssse3; pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_ssse3; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_ssse3; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_ssse3; } if (cpuid & XAVS2_CPU_SSE4) { INIT_SATD(sse4); pixf->ssd [LUMA_12x16] = xavs2_pixel_ssd_12x16_sse4; pixf->ssd [LUMA_24x32] = xavs2_pixel_ssd_24x32_sse4; pixf->ssd [LUMA_48x64] = xavs2_pixel_ssd_48x64_sse4; pixf->ssd [LUMA_64x16] = xavs2_pixel_ssd_64x16_sse4; pixf->ssd [LUMA_64x32] = xavs2_pixel_ssd_64x32_sse4; pixf->ssd [LUMA_64x48] = xavs2_pixel_ssd_64x48_sse4; pixf->ssd [LUMA_64x64] = xavs2_pixel_ssd_64x64_sse4; pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_sse4; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_sse4; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_sse4; pixf->sa8d 
[LUMA_32x32] = xavs2_pixel_sa8d_32x32_sse4; pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_sse4; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_sse4; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_sse4; } if (cpuid & XAVS2_CPU_AVX) { INIT_SATD(avx); pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_avx; /* 64x64 */ pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_avx; pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_avx; pixf->sad_x3[LUMA_64x16] = xavs2_pixel_sad_x3_64x16_avx; pixf->sad_x3[LUMA_64x48] = xavs2_pixel_sad_x3_64x48_avx; pixf->sad_x3[LUMA_48x64] = xavs2_pixel_sad_x3_48x64_avx; pixf->sad_x3[LUMA_16x64] = xavs2_pixel_sad_x3_16x64_avx; pixf->sad_x3[LUMA_32x32] = xavs2_pixel_sad_x3_32x32_avx; /* 32x32 */ pixf->sad_x3[LUMA_32x16] = xavs2_pixel_sad_x3_32x16_avx; pixf->sad_x3[LUMA_16x32] = xavs2_pixel_sad_x3_16x32_avx; pixf->sad_x3[LUMA_32x8 ] = xavs2_pixel_sad_x3_32x8_avx; pixf->sad_x3[LUMA_32x24] = xavs2_pixel_sad_x3_32x24_avx; pixf->sad_x3[LUMA_24x32] = xavs2_pixel_sad_x3_24x32_avx; pixf->sad_x3[LUMA_16x16] = xavs2_pixel_sad_x3_16x16_avx; /* 16x16 */ pixf->sad_x3[LUMA_16x8 ] = xavs2_pixel_sad_x3_16x8_avx; pixf->sad_x3[LUMA_16x4 ] = xavs2_pixel_sad_x3_16x4_avx; pixf->sad_x3[LUMA_16x12] = xavs2_pixel_sad_x3_16x12_avx; pixf->sad_x3[LUMA_12x16] = xavs2_pixel_sad_x3_12x16_avx; pixf->sad_x4[LUMA_64x64] = xavs2_pixel_sad_x4_64x64_avx; /* 64x64 */ pixf->sad_x4[LUMA_64x32] = xavs2_pixel_sad_x4_64x32_avx; pixf->sad_x4[LUMA_32x64] = xavs2_pixel_sad_x4_32x64_avx; pixf->sad_x4[LUMA_64x16] = xavs2_pixel_sad_x4_64x16_avx; pixf->sad_x4[LUMA_64x48] = xavs2_pixel_sad_x4_64x48_avx; pixf->sad_x4[LUMA_16x64] = xavs2_pixel_sad_x4_16x64_avx; pixf->sad_x4[LUMA_48x64] = xavs2_pixel_sad_x4_48x64_avx; pixf->sad_x4[LUMA_32x32] = xavs2_pixel_sad_x4_32x32_avx; /* 32x32 */ pixf->sad_x4[LUMA_32x16] = xavs2_pixel_sad_x4_32x16_avx; pixf->sad_x4[LUMA_16x32] = xavs2_pixel_sad_x4_16x32_avx; pixf->sad_x4[LUMA_32x8 ] = xavs2_pixel_sad_x4_32x8_avx; pixf->sad_x4[LUMA_32x24] = xavs2_pixel_sad_x4_32x24_avx; pixf->sad_x4[LUMA_24x32] = xavs2_pixel_sad_x4_24x32_avx; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_avx; /* 16x16 */ pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_avx; pixf->sad_x4[LUMA_16x4 ] = xavs2_pixel_sad_x4_16x4_avx; pixf->sad_x4[LUMA_16x12] = xavs2_pixel_sad_x4_16x12_avx; pixf->sad_x4[LUMA_12x16] = xavs2_pixel_sad_x4_12x16_avx; INIT_SSD (avx); pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_avx; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_avx; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_avx; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_avx; pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_avx; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_avx; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_avx; pixf->sa8d [LUMA_64x64] = xavs2_pixel_sa8d_64x64_avx; } if (cpuid & XAVS2_CPU_XOP) { INIT_SATD(xop); pixf->ssd [LUMA_16x16] = xavs2_pixel_ssd_16x16_xop; pixf->ssd [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_xop; pixf->ssd [LUMA_8x16 ] = xavs2_pixel_ssd_8x16_xop; pixf->ssd [LUMA_8x8 ] = xavs2_pixel_ssd_8x8_xop; pixf->ssd [LUMA_8x4 ] = xavs2_pixel_ssd_8x4_xop; //pixf->sa8d [LUMA_4x4 ] = xavs2_pixel_satd_4x4_xop; // in x265, this one is broken pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_xop; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_xop; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_xop; pixf->sa8d [LUMA_8x16 ] = xavs2_pixel_sa8d_8x16_xop; pixf->sa8d [LUMA_16x32] = xavs2_pixel_sa8d_16x32_xop; pixf->sa8d [LUMA_32x64] = xavs2_pixel_sa8d_32x64_xop; } #if ARCH_X86_64 if (cpuid & 
XAVS2_CPU_AVX2) { pixf->sad [LUMA_32x8 ] = xavs2_pixel_sad_32x8_avx2; pixf->sad [LUMA_32x16] = xavs2_pixel_sad_32x16_avx2; pixf->sad [LUMA_32x24] = xavs2_pixel_sad_32x24_avx2; pixf->sad [LUMA_32x32] = xavs2_pixel_sad_32x32_avx2; pixf->sad [LUMA_32x64] = xavs2_pixel_sad_32x64_avx2; pixf->sad [LUMA_48x64] = xavs2_pixel_sad_48x64_avx2; pixf->sad [LUMA_64x16] = xavs2_pixel_sad_64x16_avx2; pixf->sad [LUMA_64x32] = xavs2_pixel_sad_64x32_avx2; pixf->sad [LUMA_64x48] = xavs2_pixel_sad_64x48_avx2; pixf->sad [LUMA_64x64] = xavs2_pixel_sad_64x64_avx2; pixf->sad_x3[LUMA_8x16 ] = xavs2_pixel_sad_x3_8x16_avx2; pixf->sad_x3[LUMA_8x8 ] = xavs2_pixel_sad_x3_8x8_avx2; pixf->sad_x3[LUMA_8x4 ] = xavs2_pixel_sad_x3_8x4_avx2; pixf->sad_x4[LUMA_8x8 ] = xavs2_pixel_sad_x4_8x8_avx2; pixf->ssd [LUMA_64x64] = xavs2_pixel_ssd_64x64_avx2; pixf->ssd [LUMA_32x32] = xavs2_pixel_ssd_32x32_avx2; pixf->ssd [LUMA_16x16] = xavs2_pixel_ssd_16x16_avx2; pixf->ssd [LUMA_16x8 ] = xavs2_pixel_ssd_16x8_avx2; pixf->satd [LUMA_16x16] = xavs2_pixel_satd_16x16_avx2; pixf->satd [LUMA_16x8 ] = xavs2_pixel_satd_16x8_avx2; pixf->satd [LUMA_8x16 ] = xavs2_pixel_satd_8x16_avx2; pixf->satd [LUMA_8x8 ] = xavs2_pixel_satd_8x8_avx2; pixf->satd [LUMA_64x64] = xavs2_pixel_satd_64x64_avx2; pixf->satd [LUMA_64x32] = xavs2_pixel_satd_64x32_avx2; pixf->satd [LUMA_32x64] = xavs2_pixel_satd_32x64_avx2; pixf->satd [LUMA_64x16] = xavs2_pixel_satd_64x16_avx2; pixf->satd [LUMA_64x48] = xavs2_pixel_satd_64x48_avx2; pixf->satd [LUMA_16x64] = xavs2_pixel_satd_16x64_avx2; pixf->satd [LUMA_48x64] = xavs2_pixel_satd_48x64_avx2; pixf->satd [LUMA_32x32] = xavs2_pixel_satd_32x32_avx2; pixf->satd [LUMA_32x16] = xavs2_pixel_satd_32x16_avx2; pixf->satd [LUMA_16x32] = xavs2_pixel_satd_16x32_avx2; pixf->satd [LUMA_32x24] = xavs2_pixel_satd_32x24_avx2; pixf->satd [LUMA_16x4 ] = xavs2_pixel_satd_16x4_avx2; pixf->satd [LUMA_16x12] = xavs2_pixel_satd_16x12_avx2; pixf->sad_x3[LUMA_32x8 ] = xavs2_pixel_sad_x3_32x8_avx2; pixf->sad_x3[LUMA_32x16] = xavs2_pixel_sad_x3_32x16_avx2; pixf->sad_x3[LUMA_32x24] = xavs2_pixel_sad_x3_32x24_avx2; pixf->sad_x3[LUMA_32x32] = xavs2_pixel_sad_x3_32x32_avx2; pixf->sad_x3[LUMA_32x64] = xavs2_pixel_sad_x3_32x64_avx2; pixf->sad_x3[LUMA_48x64] = xavs2_pixel_sad_x3_48x64_avx2; pixf->sad_x3[LUMA_64x16] = xavs2_pixel_sad_x3_64x16_avx2; pixf->sad_x3[LUMA_64x32] = xavs2_pixel_sad_x3_64x32_avx2; pixf->sad_x3[LUMA_64x48] = xavs2_pixel_sad_x3_64x48_avx2; pixf->sad_x3[LUMA_64x64] = xavs2_pixel_sad_x3_64x64_avx2; pixf->sad_x4[LUMA_16x8 ] = xavs2_pixel_sad_x4_16x8_avx2; pixf->sad_x4[LUMA_16x12] = xavs2_pixel_sad_x4_16x12_avx2; pixf->sad_x4[LUMA_16x16] = xavs2_pixel_sad_x4_16x16_avx2; pixf->sad_x4[LUMA_16x32] = xavs2_pixel_sad_x4_16x32_avx2; pixf->sad_x4[LUMA_32x8 ] = xavs2_pixel_sad_x4_32x8_avx2; pixf->sad_x4[LUMA_32x16] = xavs2_pixel_sad_x4_32x16_avx2; pixf->sad_x4[LUMA_32x24] = xavs2_pixel_sad_x4_32x24_avx2; pixf->sad_x4[LUMA_32x32] = xavs2_pixel_sad_x4_32x32_avx2; pixf->sad_x4[LUMA_32x64] = xavs2_pixel_sad_x4_32x64_avx2; pixf->sad_x4[LUMA_48x64] = xavs2_pixel_sad_x4_48x64_avx2; pixf->sad_x4[LUMA_64x16] = xavs2_pixel_sad_x4_64x16_avx2; pixf->sad_x4[LUMA_64x32] = xavs2_pixel_sad_x4_64x32_avx2; pixf->sad_x4[LUMA_64x48] = xavs2_pixel_sad_x4_64x48_avx2; pixf->sad_x4[LUMA_64x64] = xavs2_pixel_sad_x4_64x64_avx2; pixf->sa8d [LUMA_8x8 ] = xavs2_pixel_sa8d_8x8_avx2; pixf->sa8d [LUMA_16x16] = xavs2_pixel_sa8d_16x16_avx2; pixf->sa8d [LUMA_32x32] = xavs2_pixel_sa8d_32x32_avx2; } #endif /* ------------------------------------------------------------- * init AVG functions 
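     * (these are the (a + b + 1) >> 1 rounding-average kernels used when two
     * prediction blocks are blended, e.g. for bi-directional prediction; the
     * C fallback generated by PIXEL_AVG_C ignores the `weight' argument)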
*/ #define INIT_PIXEL_AVG(w, h, suffix) \ pixf->avg[LUMA_## w ##x## h] = xavs2_pixel_avg_##w##x##h##_##suffix if (cpuid & XAVS2_CPU_MMX2) { INIT_PIXEL_AVG(64, 64, mmx2); INIT_PIXEL_AVG(64, 16, mmx2); INIT_PIXEL_AVG(64, 48, mmx2); INIT_PIXEL_AVG(16, 64, mmx2); INIT_PIXEL_AVG(48, 64, mmx2); INIT_PIXEL_AVG(32, 32, mmx2); INIT_PIXEL_AVG(32, 16, mmx2); INIT_PIXEL_AVG(16, 32, mmx2); INIT_PIXEL_AVG(32, 8, mmx2); INIT_PIXEL_AVG(32, 24, mmx2); INIT_PIXEL_AVG(24, 32, mmx2); INIT_PIXEL_AVG(16, 16, mmx2); INIT_PIXEL_AVG(16, 8, mmx2); INIT_PIXEL_AVG(16, 4, mmx2); INIT_PIXEL_AVG(16, 12, mmx2); INIT_PIXEL_AVG( 8, 32, mmx2); INIT_PIXEL_AVG( 8, 16, mmx2); INIT_PIXEL_AVG( 4, 16, mmx2); INIT_PIXEL_AVG(12, 16, mmx2); INIT_PIXEL_AVG( 8, 8, mmx2); INIT_PIXEL_AVG( 8, 4, mmx2); INIT_PIXEL_AVG( 4, 8, mmx2); INIT_PIXEL_AVG( 4, 4, mmx2); } if (cpuid & XAVS2_CPU_SSE2){ INIT_PIXEL_AVG(64, 64, sse2); INIT_PIXEL_AVG(64, 32, sse2); INIT_PIXEL_AVG(32, 64, sse2); INIT_PIXEL_AVG(64, 16, sse2); INIT_PIXEL_AVG(64, 48, sse2); INIT_PIXEL_AVG(16, 64, sse2); INIT_PIXEL_AVG(48, 64, sse2); INIT_PIXEL_AVG(32, 32, sse2); INIT_PIXEL_AVG(32, 16, sse2); INIT_PIXEL_AVG(16, 32, sse2); INIT_PIXEL_AVG(32, 8, sse2); INIT_PIXEL_AVG(32, 24, sse2); INIT_PIXEL_AVG(16, 16, sse2); INIT_PIXEL_AVG(16, 8, sse2); INIT_PIXEL_AVG(16, 4, sse2); INIT_PIXEL_AVG(16, 12, sse2); INIT_PIXEL_AVG( 8, 32, sse2); INIT_PIXEL_AVG(24, 32, sse2); INIT_PIXEL_AVG( 8, 16, sse2); INIT_PIXEL_AVG(12, 16, sse2); INIT_PIXEL_AVG( 8, 8, sse2); INIT_PIXEL_AVG( 8, 4, sse2); } if (cpuid & XAVS2_CPU_SSE3) { INIT_PIXEL_FUNC(avg, _ssse3); } if (cpuid & XAVS2_CPU_AVX2) { #if ARCH_X86_64 INIT_PIXEL_AVG(64, 64, avx2); INIT_PIXEL_AVG(64, 32, avx2); INIT_PIXEL_AVG(64, 16, avx2); INIT_PIXEL_AVG(64, 48, avx2); INIT_PIXEL_AVG(32, 32, avx2); INIT_PIXEL_AVG(32, 64, avx2); INIT_PIXEL_AVG(32, 16, avx2); INIT_PIXEL_AVG(32, 8, avx2); INIT_PIXEL_AVG(32, 24, avx2); #endif INIT_PIXEL_AVG(16, 64, avx2); INIT_PIXEL_AVG(16, 32, avx2); INIT_PIXEL_AVG(16, 16, avx2); INIT_PIXEL_AVG(16, 8, avx2); INIT_PIXEL_AVG(16, 4, avx2); INIT_PIXEL_AVG(16, 12, avx2); } /* block average */ if (cpuid & XAVS2_CPU_SSE42) { pixf->average = xavs2_pixel_average_sse128; } #if _MSC_VER if (cpuid & XAVS2_CPU_AVX2) { pixf->average = xavs2_pixel_average_avx; } #endif #endif /* init functions of block operation : copy/add/sub */ init_block_opreation_funcs(cpuid, pixf); #undef INIT_PIXEL_AVG #undef INIT_PIXEL_FUNC #undef INIT_SATD #undef INIT_SSD } /* --------------------------------------------------------------------------- */ static int mad_NxN_c(pel_t *p_src, int i_src, int cu_size) { pel_t *p_src_base = p_src; int num_pix = cu_size * cu_size; int x, y; int sum = 0; int f_avg = 0; /* average of all pixels in current block */ int mad = 0; /* cal average */ for (y = 0; y < cu_size; ++y) { for (x = 0; x < cu_size; ++x) { sum += p_src[x]; } p_src += i_src; } f_avg = (sum + (num_pix >> 1)) / num_pix; /* cal mad */ p_src = p_src_base; for (y = 0; y < cu_size; ++y) { for (x = 0; x < cu_size; ++x) { int f_pxl = p_src[x]; mad += XAVS2_ABS(f_pxl - f_avg); } p_src += i_src; } return mad; } /* --------------------------------------------------------------------------- */ void xavs2_mad_init(uint32_t cpuid, mad_funcs_t *madf) { madf[B16X16_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_NxN_c; /* init asm function handles */ #if HAVE_MMX /* functions defined in file intrinsic_mad.c */ if (cpuid & XAVS2_CPU_SSE2) { madf[B16X16_IN_BIT - 
MIN_CU_SIZE_IN_BIT] = mad_16x16_sse128; madf[B32X32_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_32x32_sse128; madf[B64X64_IN_BIT - MIN_CU_SIZE_IN_BIT] = mad_64x64_sse128; } #endif //if HAVE_MMX } xavs2-1.3/source/common/pixel.h000066400000000000000000000171331340660520300165060ustar00rootroot00000000000000/* * pixel.h * * Description of this file: * Pixel processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_PIXEL_H #define XAVS2_PIXEL_H /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * Luma PU partition */ enum LumaPU { /* square (the first 5 PUs match the block sizes) */ LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64, /* rectangular */ LUMA_8x4, LUMA_4x8, LUMA_16x8, LUMA_8x16, LUMA_32x16, LUMA_16x32, LUMA_64x32, LUMA_32x64, /* asymmetrical (0.75, 0.25) */ LUMA_16x12, LUMA_12x16, LUMA_16x4, LUMA_4x16, LUMA_32x24, LUMA_24x32, LUMA_32x8, LUMA_8x32, LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64, /* number */ NUM_PU_SIZES, /* total number of PU sizes */ LUMA_INVALID = 255 }; /* --------------------------------------------------------------------------- * Luma CU sizes, can be indexed using log2n(width)-2 */ enum LumaCU { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32, BLOCK_64x64, NUM_CU_SIZES /* total number of CU sizes */ }; /* --------------------------------------------------------------------------- * TU sizes */ enum TransUnit { /* square */ TU_4x4, TU_8x8, TU_16x16, TU_32x32, TU_64x64, /* asymmetrical */ TU_16x4, TU_4x16, TU_32x8, TU_8x32, TU_64x16, TU_16x64, /* number */ NUM_TU_SIZES /* total number of TU sizes */ }; /* --------------------------------------------------------------------------- * Chroma (only for 4:2:0) partition sizes. * These enum are only a convenience for indexing into the chroma primitive * arrays when instantiating macros or templates. The chroma function tables * should always be indexed by a LumaPU enum when used. 
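 * For example, with this ordering the entry reached with the LUMA_16x16 index
 * operates on the co-located 8x8 chroma block (CHROMA_8x8), and LUMA_16x8
 * maps to CHROMA_8x4, since 4:2:0 chroma has half the luma dimensions in
 * each direction.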
*/ enum ChromaPU { /* square */ CHROMA_2x2, CHROMA_4x4, CHROMA_8x8, CHROMA_16x16, CHROMA_32x32, /* rectangular */ CHROMA_4x2, CHROMA_2x4, CHROMA_8x4, CHROMA_4x8, CHROMA_16x8, CHROMA_8x16, CHROMA_32x16, CHROMA_16x32, /* asymmetrical (0.75, 0.25) */ CHROMA_8x6, CHROMA_6x8, CHROMA_8x2, CHROMA_2x8, CHROMA_16x12, CHROMA_12x16, CHROMA_16x4, CHROMA_4x16, CHROMA_32x24, CHROMA_24x32, CHROMA_32x8, CHROMA_8x32, }; /* --------------------------------------------------------------------------- */ enum ChromaCU { BLOCK_C_2x2, BLOCK_C_4x4, BLOCK_C_8x8, BLOCK_C_16x16, BLOCK_C_32x32 }; typedef cmp_dist_t(*pixel_cmp_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2); typedef dist_t(*pixel_ssd_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2); typedef dist_t(*pixel_ssd2_t)(const pel_t *pix1, intptr_t i_pix1, const pel_t *pix2, intptr_t i_pix2, int width, int height); typedef void(*pixel_cmp_x3_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2, intptr_t i_stride, int scores[3]); typedef void(*pixel_cmp_x4_t)(const pel_t *fenc, const pel_t *pix0, const pel_t *pix1, const pel_t *pix2, const pel_t *pix3, intptr_t i_stride, int scores[4]); typedef void(*copy_pp_t)(pel_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); // dst is aligned typedef void(*copy_sp_t)(pel_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); typedef void(*copy_ps_t)(coeff_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); typedef void(*copy_ss_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); typedef void(*pixel_sub_ps_t)(coeff_t* dst, intptr_t dstride, const pel_t* src0, const pel_t* src1, intptr_t sstride0, intptr_t sstride1); typedef void(*pixel_add_ps_t)(pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1); typedef void(*pixel_avg_pp_t)(pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int weight); typedef int(*mad_funcs_t)(pel_t *p_src, int i_src, int cu_size); typedef struct { pixel_cmp_t sad [NUM_PU_SIZES]; pixel_cmp_t satd [NUM_PU_SIZES]; pixel_cmp_t sa8d [NUM_PU_SIZES]; pixel_ssd_t ssd [NUM_PU_SIZES]; pixel_cmp_x3_t sad_x3 [NUM_PU_SIZES]; pixel_cmp_x4_t sad_x4 [NUM_PU_SIZES]; pixel_sub_ps_t sub_ps [NUM_PU_SIZES]; pixel_add_ps_t add_ps [NUM_PU_SIZES]; copy_sp_t copy_sp[NUM_PU_SIZES]; copy_ps_t copy_ps[NUM_PU_SIZES]; copy_ss_t copy_ss[NUM_PU_SIZES]; copy_pp_t copy_pp[NUM_PU_SIZES]; pixel_avg_pp_t avg [NUM_PU_SIZES]; pixel_cmp_t *intra_cmp; /* either satd or sad for intra mode prediction */ pixel_cmp_t *fpel_cmp; /* either satd or sad for fractional pixel comparison in ME */ mad_funcs_t madf[CTU_DEPTH]; pixel_ssd2_t ssd_block; /* block average */ void (*average)(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); } pixel_funcs_t; /** * =========================================================================== * global variables * =========================================================================== */ /* get partition index for the given size */ #define g_partition_map_tab FPFX(g_partition_map_tab) extern const uint8_t g_partition_map_tab[]; #define PART_INDEX(w, h) (g_partition_map_tab[((((w) >> 2) - 1) << 4) + ((h) >> 2) - 1]) /** * =========================================================================== * function declares * =========================================================================== */ #define 
xavs2_pixel_init FPFX(pixel_init) void xavs2_pixel_init(uint32_t cpu, pixel_funcs_t* pixf); #define xavs2_pixel_ssd_wxh FPFX(xpixel_ssd_wxh) uint64_t xavs2_pixel_ssd_wxh(pixel_funcs_t *pf, pel_t *p_pix1, intptr_t i_pix1, pel_t *p_pix2, intptr_t i_pix2, int i_width, int i_height, int inout_shift); #define xavs2_mad_init FPFX(mad_init) void xavs2_mad_init(uint32_t cpu, mad_funcs_t *madf); #endif // XAVS2_PIXEL_H xavs2-1.3/source/common/predict.h000066400000000000000000000201421340660520300170110ustar00rootroot00000000000000/* * predict.h * * Description of this file: * Prediction functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/

#ifndef XAVS2_PREDICT_H
#define XAVS2_PREDICT_H

/**
 * ===========================================================================
 * local/global variables
 * ===========================================================================
 */

static const int REF_BITS[32] = {
    1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9
};

#define REF_COST(ref_idx) WEIGHTED_COST(h->i_lambda_factor, REF_BITS[ref_idx + 1])

/**
 * ===========================================================================
 * inline function defines
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * determine whether the mv value (1/4 pixel) is legal or not
 * Return: 0: out of the legal mv range;
 *         1: in the legal mv range
 */
static ALWAYS_INLINE int check_mv_range(xavs2_t *h, const mv_t *mv, int ref_idx, int pix_x, int pix_y, int bsx, int bsy)
{
    int bsize = 1 << h->i_lcu_level;   /* valid padding size */
    int min_x = -((pix_x + bsize) << 2);
    int min_y = -((pix_y + bsize) << 2);
    int max_x = ((h->i_width  - (pix_x + bsx)) + bsize) << 2;
    int max_y = ((h->i_height - (pix_y + bsy)) + bsize) << 2;

    min_x = XAVS2_MAX(min_x, h->min_mv_range[0]);
    min_y = XAVS2_MAX(min_y, h->min_mv_range[1]);
    max_x = XAVS2_MIN(max_x, h->max_mv_range[0]);
    max_y = XAVS2_MIN(max_y, h->max_mv_range[1]);

    /* LCU row in the reference frame that this MV depends on;
     * it must already be coded when frames are encoded in parallel */
    int dep_lcu_y = (pix_y + bsy + ((mv->y >> 2) + 4) + 4) >> h->i_lcu_level;
    int dep_lcu_x = (pix_x + bsx + ((mv->x >> 2) + 4) + 4) >> h->i_lcu_level;
    int dep_lcu_row_avail;
    dep_lcu_y = XAVS2_MAX(0, dep_lcu_y);
    dep_lcu_x = XAVS2_MAX(0, dep_lcu_x);
    dep_lcu_y = XAVS2_MIN(h->i_height_in_lcu - 1, dep_lcu_y);
    dep_lcu_x = XAVS2_MIN(h->i_width_in_lcu - 1, dep_lcu_x);
    dep_lcu_row_avail = h->fref[ref_idx]->num_lcu_coded_in_row[dep_lcu_y] > dep_lcu_x;

    return dep_lcu_row_avail && (mv->x <= max_x && mv->x >= min_x && mv->y <= max_y && mv->y >= min_y);
}

/* ---------------------------------------------------------------------------
 * get distance for a reference frame
 */
static ALWAYS_INLINE int calculate_distance(xavs2_t *h, int blkref)
{
    assert(blkref >= 0 && blkref < MAX_REFS);
    return h->fdec->ref_dpoc[blkref];
}

/* ---------------------------------------------------------------------------
 * get the vertical (Y) MV offsets used for field coding
 */
static ALWAYS_INLINE void getDeltas(xavs2_t *h, int *delt, int *delt2, int OriPOC, int OriRefPOC, int ScaledPOC, int ScaledRefPOC)
{
    const int factor = 2;

    if (h->b_field_sequence == 0) {
        *delt = *delt2 = 0;
        return;
    }

    OriPOC       = (OriPOC       + 512) & 511;  // % 512
    OriRefPOC    = (OriRefPOC    + 512) & 511;
    ScaledPOC    = (ScaledPOC    + 512) & 511;
    ScaledRefPOC = (ScaledRefPOC + 512) & 511;

    assert((OriPOC % factor) + (OriRefPOC % factor) + (ScaledPOC % factor) + (ScaledRefPOC % factor) == 0);

    OriPOC       /= factor;
    OriRefPOC    /= factor;
    ScaledPOC    /= factor;
    ScaledRefPOC /= factor;

    if (h->b_top_field) {       // scaled is top field
        *delt2 = (ScaledRefPOC & 1) != (ScaledPOC & 1) ? 2 : 0;
        if ((ScaledPOC & 1) == (OriPOC & 1)) {  // ori is top
            *delt = (OriRefPOC & 1) != (OriPOC & 1) ? 2 : 0;
        } else {
            *delt = (OriRefPOC & 1) != (OriPOC & 1) ? -2 : 0;
        }
    } else {                    // scaled is bottom field
        *delt2 = (ScaledRefPOC & 1) != (ScaledPOC & 1) ? -2 : 0;
        if ((ScaledPOC & 1) == (OriPOC & 1)) {  // ori is bottom
            *delt = (OriRefPOC & 1) != (OriPOC & 1) ? -2 : 0;
        } else {
            *delt = (OriRefPOC & 1) != (OriPOC & 1) ?
2 : 0; } } } // ---------------------------------------------------------- // MV scaling for Skip/Direct Mode static ALWAYS_INLINE int16_t scale_mv_skip(int mv, int dist_dst, int dist_src) { return (int16_t)((mv * dist_dst * (MULTI / dist_src) + HALF_MULTI) >> OFFSET); } static ALWAYS_INLINE int16_t scale_mv_skip_y(xavs2_t *h, int mvy, int dist_dst, int dist_src) { if (h->b_field_sequence == 0) { return scale_mv_skip(mvy, dist_dst, dist_src); } else { int oriPOC = h->fdec->i_frm_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_frm_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_skip(mvy + delta, dist_dst, dist_src) - delta2); } } // ---------------------------------------------------------- // MV scaling for Bi-Skip/Direct Mode static ALWAYS_INLINE int16_t scale_mv_biskip(int mv, int dist_dst, int dist_src_scale) { return (int16_t)(xavs2_sign3(mv) * ((dist_src_scale * (1 + XAVS2_ABS(mv) * dist_dst) - 1) >> OFFSET)); } static ALWAYS_INLINE int16_t scale_mv_biskip_y(xavs2_t *h, int mvy, int dist_dst, int dist_src, int dist_src_scale) { int oriPOC = h->fdec->i_frm_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_frm_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_biskip(mvy + delta, dist_dst, dist_src_scale) - delta2); } /** * =========================================================================== * interface function declares * =========================================================================== */ #define get_mv_predictors_bskip FPFX(get_mv_predictors_bskip) int get_mv_predictors_bskip(xavs2_t *h, cu_t *p_cu); #define get_mv_predictors_pskip FPFX(get_mv_predictors_pskip) int get_mv_predictors_pskip(xavs2_t *h, cu_t *p_cu); #define get_mvp_default FPFX(get_mvp_default) void get_mvp_default (xavs2_t *h, const neighbor_inter_t *p_neighbors, mv_t *pmv, int bwd_2nd, cb_t *p_cb, int ref_idx); #define pred_inter_search_single FPFX(pred_inter_search_single) int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *fwd_cost, dist_t *bwd_cost); #define pred_inter_search_bi FPFX(pred_inter_search_bi) void pred_inter_search_bi (xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *sym_mcost, dist_t *bid_mcost); #define pred_inter_search_dual FPFX(pred_inter_search_dual) void pred_inter_search_dual (xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *dual_mcost, int *dual_best_fst_ref, int *dual_best_snd_ref); #endif // XAVS2_PREDICT_H xavs2-1.3/source/common/primitives.c000066400000000000000000000054761340660520300175620ustar00rootroot00000000000000/* * primitives.c * * Description of this file: * function handles initialize functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "primitives.h" #include "cpu.h" #include "intra.h" #include "mc.h" #include "transform.h" #include "filter.h" #include "sao.h" /* --------------------------------------------------------------------------- * global function handle */ intrinsic_func_t g_funcs; /* --------------------------------------------------------------------------- */ void xavs2_init_all_primitives(xavs2_param_t* param, intrinsic_func_t *p_funcs) { uint32_t cpuid = p_funcs->cpuid; if (param != NULL) { if (param->sample_bit_depth != g_bit_depth) { xavs2_log(NULL, XAVS2_LOG_ERROR, "init primitives error: only %d bit-depth is supported\n", g_bit_depth); } } /* init memory operation function handlers */ xavs2_mem_oper_init (cpuid, p_funcs); /* init function handles */ xavs2_intra_pred_init(cpuid, p_funcs); xavs2_mc_init (cpuid, p_funcs); xavs2_pixel_init (cpuid, &p_funcs->pixf); xavs2_deblock_init (cpuid, p_funcs); xavs2_dct_init (cpuid, &p_funcs->dctf); xavs2_quant_init (cpuid, &p_funcs->dctf); xavs2_cg_scan_init (cpuid, p_funcs); xavs2_mad_init (cpuid, p_funcs->pixf.madf); xavs2_sao_init (cpuid, p_funcs); xavs2_alf_init (cpuid, p_funcs); xavs2_rdo_init (cpuid, p_funcs); } xavs2-1.3/source/common/primitives.h000066400000000000000000000273231340660520300175620ustar00rootroot00000000000000/* * primitives.h * * Description of this file: * function handles initialize functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_PRIMITIVES_H #define XAVS2_PRIMITIVES_H #include "pixel.h" /** * =========================================================================== * function definitions and structures * =========================================================================== */ /** * =========================================================================== * type defines * =========================================================================== */ typedef void *(*memcpy_t)(void *dst, const void *src, size_t n); /* --------------------------------------------------------------------------- * inter prediction */ typedef void(*block_copy_t )(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); typedef void(*plane_copy_di_t)(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h); typedef void(*intpl_t )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); typedef void(*intpl_ext_t )(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y); typedef void(*intpl_luma_hor_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff); typedef void(*intpl_luma_ext_t)(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); typedef void(*intpl_luma_ver_t)(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff); typedef void(*intpl_luma_ver_x3_t)(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff); typedef void(*intpl_luma_hor_x3_t)(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); typedef void(*intpl_luma_ext_x3_t)(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); typedef void (*filter_pp_t) (const pel_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_hps_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt); typedef void (*filter_ps_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_sp_t) (const int16_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_ss_t) (const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_hv_pp_t) (const pel_t *src, intptr_t srcStride, pel_t *dst, intptr_t dstStride, int idxX, int idxY); typedef void (*filter_p2s_t) (const pel_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride); /* --------------------------------------------------------------------------- * intra prediction */ typedef void(*intra_pred_t)(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); typedef void(*fill_edge_t) (const pel_t *p_topleft, int i_topleft, const pel_t *p_lcu_ep, pel_t *ep, uint32_t i_avail, int bsx, int bsy); typedef void(*fill_ref_samples_t)(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy); /* --------------------------------------------------------------------------- * transform and quantization functions */ typedef void(*dct_t)(const coeff_t *src, coeff_t *dst, int i_src); /* --------------------------------------------------------------------------- * coefficient scan */ typedef void(*coeff_scan_t)(coeff_t 
*dst, const coeff_t *src, int i_src_shift);
typedef void(*coeff_scan4x4_t)(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4);

typedef struct {
    /* dct */
    dct_t dct     [NUM_PU_SIZES];
    dct_t idct    [NUM_PU_SIZES];
    dct_t dct_half[NUM_PU_SIZES];   // DCT computing only the low-frequency coefficients

    /* 2nd transform */
    void(*transform_4x4_2nd)    (coeff_t *coeff, int i_coeff);
    void(*inv_transform_4x4_2nd)(coeff_t *coeff, int i_coeff);
    void(*transform_2nd)        (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);
    void(*inv_transform_2nd)    (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);

    /* quant */
    void(*abs_coeff)(coeff_t *coef, const coeff_t *src, const int i_coef);
    int (*add_sign) (coeff_t *coef, const coeff_t *abs_val, const int i_coef);
    int (*quant)    (coeff_t *coef, const int i_coef, const int scale, const int shift, const int add);
    void(*dequant)  (coeff_t *coef, const int i_coef, const int scale, const int shift);
    int (*wquant)   (coeff_t *coef, const int i_coef, const int scale, const int shift, const int add, int *levelscale);
} dct_funcs_t;

/* SAO filter function */
typedef void(*sao_flt_t)(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param);

/* --------------------------------------------------------------------------- */
typedef struct intrinsic_func_t {
    ALIGN32(uint32_t cpuid);

    memcpy_t fast_memcpy;
    memcpy_t memcpy_aligned;
    void*(*fast_memzero)   (void *dst, size_t n);
    void*(*memzero_aligned)(void *dst, size_t n);
    void*(*fast_memset)    (void *dst, int val, size_t n);
    void (*mem_repeat_i)   (void *dst, int val, size_t count);
    void*(*mem_repeat_p)   (void *dst, int val, size_t count);
    void (*lowres_filter)  (pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height);

    pixel_funcs_t pixf;

    /* ---------------------------------------------------------------------------
     * block copy */
    /* align copy */
    block_copy_t    align_copy;
    /* plane copy */
    block_copy_t    plane_copy;
    plane_copy_di_t plane_copy_deinterleave;

    /* ---------------------------------------------------------------------------
     * Motion Compensation */
    intpl_luma_hor_t    intpl_luma_hor;
    intpl_luma_ver_t    intpl_luma_ver;
    intpl_luma_ext_t    intpl_luma_ext;
    intpl_luma_ver_x3_t intpl_luma_ver_x3;
    intpl_luma_hor_x3_t intpl_luma_hor_x3;
    intpl_luma_ext_x3_t intpl_luma_ext_x3;
    intpl_t             intpl_luma_block_hor;
    intpl_t             intpl_luma_block_ver;
    intpl_ext_t         intpl_luma_block_ext;
    intpl_t             intpl_chroma_block_hor;
    intpl_t             intpl_chroma_block_ver;
    intpl_ext_t         intpl_chroma_block_ext;

    struct inter_pred_t {
        filter_pp_t    luma_hpp;    // 8-tap luma motion compensation interpolation filters
        filter_hps_t   luma_hps;
        filter_pp_t    luma_vpp;
        filter_ps_t    luma_vps;
        filter_sp_t    luma_vsp;
        filter_ss_t    luma_vss;
        filter_hv_pp_t luma_hvpp;   // combines hps + vsp
    } intpl[NUM_PU_SIZES];

    /* ---------------------------------------------------------------------------
     * intra prediction */
    intra_pred_t       intraf[NUM_INTRA_MODE];
    fill_edge_t        fill_edge_f[4];      /* 0, x, y, xy */
    fill_ref_samples_t fill_ref_luma[2];    /* 0: CU inside picture; 1: on right/bottom */

    /* ---------------------------------------------------------------------------
     * transform and quantization */
    dct_funcs_t     dctf;
    coeff_scan_t    transpose_coeff_scan[NUM_PU_SIZES][2];  /* [TU size][0: no transpose; 1: transpose] */
    coeff_scan4x4_t transpose_coeff_4x4[2];                  /* [TU size][0: no transpose; 1: transpose] */

    /* ---------------------------------------------------------------------------
     * In-loop filter */
    void(*deblock_luma[2])(pel_t *, int, int, int, uint8_t*);
    void(*deblock_chroma[2])(pel_t *,
pel_t *, int, int, int, uint8_t*); void(*deblock_luma_double[2]) (pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag); void(*deblock_chroma_double[2])(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag); sao_flt_t sao_block; /* filter for SAO */ /* function handles */ void(*alf_flt[2])(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail); /* ----------------------------------------------------------------------- * RDO procedure */ int (*get_skip_mv_predictors[SLICE_TYPE_NUM])(xavs2_t *h, cu_t *p_cu); /* get MVs for skip/direct mode */ rdcost_t (*compress_ctu[SLICE_TYPE_NUM])(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit); } intrinsic_func_t; extern intrinsic_func_t g_funcs; /** * =========================================================================== * interface function declares * =========================================================================== */ #define xavs2_mem_oper_init FPFX(mem_oper_init) void xavs2_mem_oper_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_mc_init FPFX(mc_init) void xavs2_mc_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_intra_pred_init FPFX(intra_pred_init) void xavs2_intra_pred_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_dct_init FPFX(dct_init) void xavs2_dct_init (uint32_t cpuid, dct_funcs_t *dctf); #define xavs2_quant_init FPFX(quant_init) void xavs2_quant_init (uint32_t cpuid, dct_funcs_t *quantf); #define xavs2_cg_scan_init FPFX(cg_scan_init) void xavs2_cg_scan_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_deblock_init FPFX(deblock_init) void xavs2_deblock_init (uint32_t cpuid, intrinsic_func_t* lf); #define xavs2_sao_init FPFX(sao_init) void xavs2_sao_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_alf_init FPFX(alf_init) void xavs2_alf_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_rdo_init FPFX(rdo_init) void xavs2_rdo_init (uint32_t cpuid, intrinsic_func_t *pf); #define xavs2_init_all_primitives FPFX(init_all_primitives) void xavs2_init_all_primitives (xavs2_param_t* param, intrinsic_func_t *p_funcs); #endif // XAVS2_PRIMITIVES_H xavs2-1.3/source/common/quant.c000066400000000000000000000177451340660520300165210ustar00rootroot00000000000000/* * quant.h * * Description of this file: * Quant functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "primitives.h" #include "block_info.h" #include "cpu.h" #if HAVE_MMX #include "x86/quant8.h" #endif /** * =========================================================================== * local/global variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ const uint16_t tab_Q_TAB[80] = { 32768, 29775, 27554, 25268, 23170, 21247, 19369, 17770, 16302, 15024, 13777, 12634, 11626, 10624, 9742, 8958, 8192, 7512, 6889, 6305, 5793, 5303, 4878, 4467, 4091, 3756, 3444, 3161, 2894, 2654, 2435, 2235, 2048, 1878, 1722, 1579, 1449, 1329, 1218, 1117, 1024, 939, 861, 790, 724, 664, 609, 558, 512, 470, 430, 395, 362, 332, 304, 279, 256, 235, 215, 197, 181, 166, 152, 140, 128, 116, 108, 99, 91, 83, 76, 69, 64, 59, 54, 49, 45, 41, 38, 35 }; /* --------------------------------------------------------------------------- */ const uint16_t tab_IQ_TAB[80] = { 32768, 36061, 38968, 42495, 46341, 50535, 55437, 60424, 32932, 35734, 38968, 42495, 46177, 50535, 55109, 59933, 65535, 35734, 38968, 42577, 46341, 50617, 55027, 60097, 32809, 35734, 38968, 42454, 46382, 50576, 55109, 60056, 65535, 35734, 38968, 42495, 46320, 50515, 55109, 60076, 65535, 35744, 38968, 42495, 46341, 50535, 55099, 60087, 65535, 35734, 38973, 42500, 46341, 50535, 55109, 60097, 32771, 35734, 38965, 42497, 46341, 50535, 55109, 60099, 32768, 36061, 38968, 42495, 46341, 50535, 55437, 60424, 32932, 35734, 38968, 42495, 46177, 50535, 55109, 59933 }; /* --------------------------------------------------------------------------- */ const uint8_t tab_IQ_SHIFT[80] = { 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6 }; /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * i_coef - number of coeffs, 16 <= i_coef <= 1024 */ static int quant_c(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add) { int num_non_zero = 0; int i; for (i = 0; i < i_coef; i++) { coef[i] = (coeff_t)(xavs2_sign2(coef[i]) * ((XAVS2_ABS(coef[i]) * scale + add) >> shift)); num_non_zero += coef[i] != 0; } return num_non_zero; } /* --------------------------------------------------------------------------- * i_coef - number of coeffs, 16 <= i_coef <= 1024 */ static void abs_coeff_c(coeff_t *dst, const coeff_t *src, const int i_coef) { int i; for (i = 0; i < i_coef; i++) { dst[i] = (coeff_t)abs(src[i]); } } /* --------------------------------------------------------------------------- * i_coef - number of coeffs, 16 <= i_coef <= 1024 */ static int add_sign_c(coeff_t *dst, const coeff_t *abs_val, const int i_coef) { int nz = 0; int i; for (i = 0; i < i_coef; i++) { dst[i] = (dst[i] > 0) ? 
abs_val[i] : -abs_val[i]; nz += (!!abs_val[i]); } return nz; } /* --------------------------------------------------------------------------- * adaptive frequency weighting quantization */ static int quant_weighted_c(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add, int *levelscale) { int num_non_zero = 0; int i; for (i = 0; i < i_coef; i++) { coef[i] = (coeff_t)(xavs2_sign2(coef[i]) * ((((XAVS2_ABS(coef[i]) * levelscale[i] + (1 << 18)) >> 19) * scale + add) >> shift)); num_non_zero += coef[i] != 0; } return num_non_zero; } /* --------------------------------------------------------------------------- */ static void dequant_c(coeff_t *coef, const int i_coef, const int scale, const int shift) { const int add = (1 << (shift - 1)); int k; for (k = 0; k < i_coef; k++) { if (coef[k] != 0) { // dequantization & descale coef[k] = (coeff_t)XAVS2_CLIP3(-32768, 32767, (coef[k] * scale + add) >> shift); } } } #if ENABLE_WQUANT /* --------------------------------------------------------------------------- */ static void dequant_weighted_c(coeff_t *coef, int i_coef, int scale, int shift, int wqm_shift, int wqm_stride, int xy_shift, int16_t *wq_matrix, const int16_t(*AVS_SCAN)[2]) { const int add = (1 << (shift - 1)); const int wqm_mask = wqm_stride - 1; int xx, yy; int k; int16_t wqm_coef = 0; for (k = 0; k < i_coef; k++) { xx = AVS_SCAN[k][0] >> xy_shift; yy = AVS_SCAN[k][1] >> xy_shift; wqm_coef = wq_matrix[(yy & wqm_mask) * wqm_stride + (xx & wqm_mask)]; if (coef[k] != 0) { // dequantization & descale coef[k] = (coeff_t)XAVS2_CLIP3(-32768, 32767, (((((coef[k] * wqm_coef) >> wqm_shift) * scale) >> 4) + add) >> shift); } } } #endif /* --------------------------------------------------------------------------- */ void xavs2_quant_init(uint32_t cpuid, dct_funcs_t *dctf) { /* init c function handles */ dctf->quant = quant_c; dctf->dequant = dequant_c; dctf->wquant = quant_weighted_c; dctf->abs_coeff = abs_coeff_c; dctf->add_sign = add_sign_c; /* init asm function handles */ #if HAVE_MMX if (cpuid & XAVS2_CPU_SSE4) { dctf->quant = FPFX(quant_sse4); dctf->dequant = FPFX(dequant_sse4); dctf->abs_coeff = abs_coeff_sse128; dctf->add_sign = add_sign_sse128; } if (cpuid & XAVS2_CPU_AVX2) { dctf->quant = quant_c_avx2; dctf->dequant = dequant_c_avx2; dctf->abs_coeff = abs_coeff_avx2; dctf->add_sign = add_sign_avx2; #if _MSC_VER dctf->quant = FPFX(quant_avx2); // would cause mis-match on some machine/system #endif #if ARCH_X86_64 dctf->dequant = FPFX(dequant_avx2); #endif } #else UNUSED_PARAMETER(cpuid); #endif // if HAVE_MMX } xavs2-1.3/source/common/threadpool.c000066400000000000000000000275471340660520300175330ustar00rootroot00000000000000/* * threadpool.c * * Description of this file: * thread pooling functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "threadpool.h" #include "cpu.h" /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * job */ typedef struct threadpool_job_t { xavs2_tfunc_t func; void *arg; void *ret; int wait; } threadpool_job_t; /* --------------------------------------------------------------------------- * synchronized job list */ typedef struct xavs2_sync_job_list_t { xavs2_thread_mutex_t mutex; xavs2_thread_cond_t cv_fill; /* event signaling that the list became fuller */ xavs2_thread_cond_t cv_empty; /* event signaling that the list became emptier */ int i_max_size; int i_size; threadpool_job_t *list[XAVS2_THREAD_MAX + 1]; } xavs2_sync_job_list_t; /* --------------------------------------------------------------------------- * thread pool */ struct xavs2_threadpool_t { int i_exit; /* exit flag */ int i_threads; /* thread number in pool */ xavs2_tfunc_t init_func; void *init_arg; /* requires a synchronized list structure and associated methods, so use what is already implemented for jobs */ xavs2_sync_job_list_t uninit; /* list of jobs that are awaiting use */ xavs2_sync_job_list_t run; /* list of jobs that are queued for processing by the pool */ xavs2_sync_job_list_t done; /* list of jobs that have finished processing */ /* handler of threads */ xavs2_thread_t thread_handle[XAVS2_THREAD_MAX]; uint8_t cpu_core_used[64]; }; /** * =========================================================================== * thread properties * =========================================================================== */ /* --------------------------------------------------------------------------- */ static INLINE int xavs2_thread_set_cpu(int idx_core) { #if HAVE_POSIXTHREAD && (SYS_WINDOWS || SYS_LINUX) && !__MINGW32__ cpu_set_t mask; CPU_ZERO(&mask); CPU_SET(idx_core, &mask); if (-1 == pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { return -1; } return 0; #else return 0; #endif } /* --------------------------------------------------------------------------- */ static INLINE int xavs2_thread_is_on_cpu(int idx_core) { #if HAVE_POSIXTHREAD && (SYS_WINDOWS || SYS_LINUX) && !__MINGW32__ cpu_set_t get; CPU_ZERO(&get); if (pthread_getaffinity_np(pthread_self(), sizeof(get), &get) < 0) { fprintf(stderr, "get thread affinity failed\n"); } return (CPU_ISSET(idx_core, &get)); #else return 0; #endif } /** * =========================================================================== * list operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static threadpool_job_t *xavs2_job_shift(threadpool_job_t **list) { threadpool_job_t *job = list[0]; int i; for (i = 0; list[i]; i++) { list[i] = list[i + 1]; } 
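    /* list[0] has been removed and the remaining entries compacted one slot
     * forward (the loop stops at the first NULL slot); the caller, which
     * still holds the list mutex, decrements the owning list's i_size */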
assert(job); return job; } /** * =========================================================================== * list operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static int xavs2_sync_job_list_init(xavs2_sync_job_list_t *slist, int i_max_size) { if (i_max_size < 0 || i_max_size > XAVS2_THREAD_MAX) { return -1; } slist->i_max_size = i_max_size; slist->i_size = 0; if (xavs2_thread_mutex_init(&slist->mutex, NULL) || xavs2_thread_cond_init(&slist->cv_fill, NULL) || xavs2_thread_cond_init(&slist->cv_empty, NULL)) { return -1; } return 0; } /* --------------------------------------------------------------------------- */ static void xavs2_sync_job_list_delete(xavs2_sync_job_list_t *slist) { xavs2_thread_mutex_destroy(&slist->mutex); xavs2_thread_cond_destroy(&slist->cv_fill); xavs2_thread_cond_destroy(&slist->cv_empty); } /* --------------------------------------------------------------------------- */ static void xavs2_sync_job_list_push(xavs2_sync_job_list_t *slist, threadpool_job_t *job) { xavs2_thread_mutex_lock(&slist->mutex); /* lock */ while (slist->i_size == slist->i_max_size) { xavs2_thread_cond_wait(&slist->cv_empty, &slist->mutex); } slist->list[slist->i_size++] = job; xavs2_thread_mutex_unlock(&slist->mutex); /* unlock */ xavs2_thread_cond_broadcast(&slist->cv_fill); } /* --------------------------------------------------------------------------- */ static threadpool_job_t *xavs2_sync_job_list_pop(xavs2_sync_job_list_t *slist) { threadpool_job_t *job; xavs2_thread_mutex_lock(&slist->mutex); /* lock */ while (!slist->i_size) { xavs2_thread_cond_wait(&slist->cv_fill, &slist->mutex); } job = slist->list[--slist->i_size]; slist->list[slist->i_size] = NULL; xavs2_thread_cond_broadcast(&slist->cv_empty); xavs2_thread_mutex_unlock(&slist->mutex); /* unlock */ return job; } /** * =========================================================================== * thread pool operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void *proc_xavs2_threadpool_thread(xavs2_threadpool_t *pool) { /* init */ if (pool->init_func) { pool->init_func(pool->init_arg); } /* loop until exit flag is set */ while (pool->i_exit != XAVS2_EXIT_THREAD) { threadpool_job_t *job = NULL; /* fetch a job */ xavs2_thread_mutex_lock(&pool->run.mutex); /* lock */ while (pool->i_exit != XAVS2_EXIT_THREAD && !pool->run.i_size) { xavs2_thread_cond_wait(&pool->run.cv_fill, &pool->run.mutex); } if (pool->run.i_size) { job = xavs2_job_shift(pool->run.list); pool->run.i_size--; } xavs2_thread_mutex_unlock(&pool->run.mutex); /* unlock */ /* do the job */ if (!job) { continue; } job->ret = job->func(job->arg); /* execute the function */ /* the job is done */ if (job->wait) { xavs2_sync_job_list_push(&pool->done, job); } else { xavs2_sync_job_list_push(&pool->uninit, job); } } return NULL; } /* --------------------------------------------------------------------------- */ int xavs2_threadpool_init(xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_t init_func, void *init_arg) { xavs2_threadpool_t *pool; uint8_t *mem_ptr = NULL; int size_mem = 0; int i; if (threads <= 0) { return -1; } threads = XAVS2_MIN(threads, XAVS2_THREAD_MAX); size_mem = sizeof(xavs2_threadpool_t) + threads * sizeof(threadpool_job_t) + CACHE_LINE_SIZE * XAVS2_THREAD_MAX * 2; CHECKED_MALLOCZERO(mem_ptr, uint8_t *, 
size_mem); pool = (xavs2_threadpool_t *)mem_ptr; mem_ptr += sizeof(xavs2_threadpool_t); ALIGN_POINTER(mem_ptr); *p_pool = pool; pool->init_func = init_func; pool->init_arg = init_arg; pool->i_threads = threads; if (xavs2_sync_job_list_init(&pool->uninit, pool->i_threads) || xavs2_sync_job_list_init(&pool->run, pool->i_threads) || xavs2_sync_job_list_init(&pool->done, pool->i_threads)) { goto fail; } for (i = 0; i < pool->i_threads; i++) { threadpool_job_t *job = (threadpool_job_t *)mem_ptr; mem_ptr += sizeof(threadpool_job_t); ALIGN_POINTER(mem_ptr); xavs2_sync_job_list_push(&pool->uninit, job); } for (i = 0; i < pool->i_threads; i++) { if (xavs2_create_thread(pool->thread_handle + i, (xavs2_tfunc_t)proc_xavs2_threadpool_thread, pool)) { goto fail; } } return 0; fail: return -1; } /* --------------------------------------------------------------------------- */ void xavs2_threadpool_run(xavs2_threadpool_t *pool, void *(*func)(void *), void *arg, int wait_sign) { threadpool_job_t *job = xavs2_sync_job_list_pop(&pool->uninit); job->func = func; job->arg = arg; job->wait = wait_sign; xavs2_sync_job_list_push(&pool->run, job); } /* --------------------------------------------------------------------------- */ void *xavs2_threadpool_wait(xavs2_threadpool_t *pool, void *arg) { threadpool_job_t *job = NULL; void *ret; int i; xavs2_thread_mutex_lock(&pool->done.mutex); /* lock */ while (!job) { for (i = 0; i < pool->done.i_size; i++) { threadpool_job_t *t = pool->done.list[i]; if (t->arg == arg) { job = xavs2_job_shift(pool->done.list + i); pool->done.i_size--; break; /* found the job according to arg */ } } if (!job) { xavs2_thread_cond_wait(&pool->done.cv_fill, &pool->done.mutex); } } xavs2_thread_mutex_unlock(&pool->done.mutex); /* unlock */ ret = job->ret; xavs2_sync_job_list_push(&pool->uninit, job); return ret; } /* --------------------------------------------------------------------------- */ void xavs2_threadpool_delete(xavs2_threadpool_t *pool) { int i; xavs2_thread_mutex_lock(&pool->run.mutex); /* lock */ pool->i_exit = XAVS2_EXIT_THREAD; xavs2_thread_cond_broadcast(&pool->run.cv_fill); xavs2_thread_mutex_unlock(&pool->run.mutex); /* unlock */ for (i = 0; i < pool->i_threads; i++) { xavs2_thread_join(pool->thread_handle[i], NULL); } xavs2_sync_job_list_delete(&pool->uninit); xavs2_sync_job_list_delete(&pool->run); xavs2_sync_job_list_delete(&pool->done); xavs2_free(pool); } xavs2-1.3/source/common/threadpool.h000066400000000000000000000042521340660520300175240ustar00rootroot00000000000000/* * threadpool.h * * Description of this file: * thread pooling functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_THREADPOOL_H #define XAVS2_THREADPOOL_H typedef struct xavs2_threadpool_t xavs2_threadpool_t; #define xavs2_threadpool_init FPFX(threadpool_init) int xavs2_threadpool_init (xavs2_threadpool_t **p_pool, int threads, xavs2_tfunc_t init_func, void *init_arg); #define xavs2_threadpool_run FPFX(threadpool_run) void xavs2_threadpool_run (xavs2_threadpool_t *pool, void *(*func)(void *), void *arg, int wait_sign); #define xavs2_threadpool_wait FPFX(threadpool_wait) void *xavs2_threadpool_wait (xavs2_threadpool_t *pool, void *arg); #define xavs2_threadpool_delete FPFX(threadpool_delete) void xavs2_threadpool_delete(xavs2_threadpool_t *pool); #endif // XAVS2_THREADPOOL_H xavs2-1.3/source/common/transform.c000066400000000000000000002023131340660520300173670ustar00rootroot00000000000000/* * transform.c * * Description of this file: * transform functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "transform.h" #include "block_info.h" #include "cpu.h" #if HAVE_MMX #include "x86/dct8.h" #include "vec/intrinsic.h" #endif // --------------------------------------------------------------------------- #define LOT_MAX_WLT_TAP 2 // number of wavelet transform tap (5-3) /** * =========================================================================== * local/global variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const int16_t g_T4[4][4] = { { 32, 32, 32, 32 }, { 42, 17, -17, -42 }, { 32, -32, -32, 32 }, { 17, -42, 42, -17 } }; /* --------------------------------------------------------------------------- */ static const int16_t g_T8[8][8] = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 44, 38, 25, 9, -9, -25, -38, -44 }, { 42, 17, -17, -42, -42, -17, 17, 42 }, { 38, -9, -44, -25, 25, 44, 9, -38 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 25, -44, 9, 38, -38, -9, 44, -25 }, { 17, -42, 42, -17, -17, 42, -42, 17 }, { 9, -25, 38, -44, 44, -38, 25, -9 } }; /* --------------------------------------------------------------------------- */ static const int16_t g_T16[16][16] = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, // 0 { 45, 43, 40, 35, 29, 21, 13, 4, -4, -13, -21, -29, -35, -40, -43, -45 }, // 1 { 44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44 }, // 2 { 43, 29, 4, -21, -40, -45, -35, -13, 13, 35, 45, 40, 21, -4, -29, -43 }, // 3 { 42, 17, -17, -42, -42, -17, 17, 42, 42, 17, -17, -42, -42, -17, 17, 42 }, // 4 { 40, 4, -35, -43, -13, 29, 45, 21, -21, -45, -29, 13, 43, 35, -4, -40 }, // 5 { 38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38 }, // 6 { 35, -21, -43, 4, 45, 13, -40, -29, 29, 40, -13, -45, -4, 43, 21, -35 }, // 7 { 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32 }, // 8 { 29, -40, -13, 45, -4, -43, 21, 35, -35, -21, 43, 4, -45, 13, 40, -29 }, // 9 { 25, -44, 9, 38, -38, -9, 44, -25, -25, 44, -9, -38, 38, 9, -44, 25 }, // 10 { 21, -45, 29, 13, -43, 35, 4, -40, 40, -4, -35, 43, -13, -29, 45, -21 }, // 11 { 17, -42, 42, -17, -17, 42, -42, 17, 17, -42, 42, -17, -17, 42, -42, 17 }, // 12 { 13, -35, 45, -40, 21, 4, -29, 43, -43, 29, -4, -21, 40, -45, 35, -13 }, // 13 { 9, -25, 38, -44, 44, -38, 25, -9, -9, 25, -38, 44, -44, 38, -25, 9 }, // 14 { 4, -13, 21, -29, 35, -40, 43, -45, 45, -43, 40, -35, 29, -21, 13, -4 } // 15 }; /* --------------------------------------------------------------------------- */ static const int16_t g_T32[32][32] = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 },//0 { 45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2, -2, -7,-11,-15,-19,-23,-27,-30,-34,-36,-39,-41,-43,-44,-45,-45 },//1 { 45, 43, 40, 35, 29, 21, 13, 4, -4,-13,-21,-29,-35,-40,-43,-45,-45,-43,-40,-35,-29,-21,-13, -4, 4, 13, 21, 29, 35, 40, 43, 45 },//2 { 45, 41, 34, 23, 11, -2,-15,-27,-36,-43,-45,-44,-39,-30,-19, -7, 7, 19, 30, 39, 44, 45, 43, 36, 27, 15, 2,-11,-23,-34,-41,-45 },//3 { 44, 38, 25, 9, -9,-25,-38,-44,-44,-38,-25, -9, 9, 25, 38, 44, 44, 38, 25, 9, -9,-25,-38,-44,-44,-38,-25, -9, 9, 25, 38, 44 },//4 { 44, 34, 15, -7,-27,-41,-45,-39,-23, -2, 19, 36, 45, 43, 30, 11,-11,-30,-43,-45,-36,-19, 2, 23, 39, 45, 41, 27, 7,-15,-34,-44 },//5 { 43, 29, 4,-21,-40,-45,-35,-13, 13, 35, 45, 40, 21, -4,-29,-43,-43,-29, -4, 21, 40, 45, 35, 13,-13,-35,-45,-40,-21, 4, 29, 43 
},// { 43, 23, -7,-34,-45,-36,-11, 19, 41, 44, 27, -2,-30,-45,-39,-15, 15, 39, 45, 30, 2,-27,-44,-41,-19, 11, 36, 45, 34, 7, -23,-43 },// { 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42 },//8 { 41, 11,-27,-45,-30, 7, 39, 43, 15,-23,-45,-34, 2, 36, 44, 19,-19,-44,-36, -2, 34, 45, 23,-15,-43,-39, -7, 30, 45, 27,-11,-41 },// { 40, 4,-35,-43,-13, 29, 45, 21,-21,-45,-29, 13, 43, 35, -4,-40,-40, -4, 35, 43, 13,-29,-45,-21, 21, 45, 29,-13,-43,-35, 4, 40 },//10 { 39, -2,-41,-36, 7, 43, 34,-11,-44,-30, 15, 45, 27,-19,-45,-23, 23, 45, 19,-27,-45,-15, 30, 44, 11,-34,-43, -7, 36, 41, 2,-39 },// { 38, -9,-44,-25, 25, 44, 9,-38,-38, 9, 44, 25,-25,-44, -9, 38, 38, -9,-44,-25, 25, 44, 9,-38,-38, 9, 44, 25,-25,-44, -9, 38 },//12 { 36,-15,-45,-11, 39, 34,-19,-45, -7, 41, 30,-23,-44, -2, 43, 27,-27,-43, 2, 44, 23,-30,-41, 7, 45, 19,-34,-39, 11, 45, 15,-36 },// { 35,-21,-43, 4, 45, 13,-40,-29, 29, 40,-13,-45, -4, 43, 21,-35,-35, 21, 43, -4,-45,-13, 40, 29,-29,-40, 13, 45, 4,-43,-21, 35 },//14 { 34,-27,-39, 19, 43,-11,-45, 2, 45, 7,-44,-15, 41, 23,-36,-30, 30, 36,-23,-41, 15, 44, -7,-45, -2, 45, 11,-43,-19, 39, 27,-34 },// { 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32 },//16 { 30,-36,-23, 41, 15,-44, -7, 45, -2,-45, 11, 43,-19,-39, 27, 34,-34,-27, 39, 19,-43,-11, 45, 2,-45, 7, 44,-15,-41, 23, 36,-30 },// { 29,-40,-13, 45, -4,-43, 21, 35,-35,-21, 43, 4,-45, 13, 40,-29,-29, 40, 13,-45, 4, 43,-21,-35, 35, 21,-43, -4, 45,-13,-40, 29 },//18 { 27,-43, -2, 44,-23,-30, 41, 7,-45, 19, 34,-39,-11, 45,-15,-36, 36, 15,-45, 11, 39,-34,-19, 45, -7,-41, 30, 23,-44, 2, 43,-27 },// { 25,-44, 9, 38,-38, -9, 44,-25,-25, 44, -9,-38, 38, 9,-44, 25, 25,-44, 9, 38,-38, -9, 44,-25,-25, 44, -9,-38, 38, 9,-44, 25 },//20 { 23,-45, 19, 27,-45, 15, 30,-44, 11, 34,-43, 7, 36,-41, 2, 39,-39, -2, 41,-36, -7, 43,-34,-11, 44,-30,-15, 45,-27,-19, 45,-23 },// { 21,-45, 29, 13,-43, 35, 4,-40, 40, -4,-35, 43,-13,-29, 45,-21,-21, 45,-29,-13, 43,-35, -4, 40,-40, 4, 35,-43, 13, 29,-45, 21 },//22 { 19,-44, 36, -2,-34, 45,-23,-15, 43,-39, 7, 30,-45, 27, 11,-41, 41,-11,-27, 45,-30, -7, 39,-43, 15, 23,-45, 34, 2,-36, 44,-19 },// { 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17 },//24 { 15,-39, 45,-30, 2, 27,-44, 41,-19,-11, 36,-45, 34, -7,-23, 43,-43, 23, 7,-34, 45,-36, 11, 19,-41, 44,-27, -2, 30,-45, 39,-15 },// { 13,-35, 45,-40, 21, 4,-29, 43,-43, 29, -4,-21, 40,-45, 35,-13,-13, 35,-45, 40,-21, -4, 29,-43, 43,-29, 4, 21,-40, 45,-35, 13 },//26 { 11,-30, 43,-45, 36,-19, -2, 23,-39, 45,-41, 27, -7,-15, 34,-44, 44,-34, 15, 7,-27, 41,-45, 39,-23, 2, 19,-36, 45,-43, 30,-11 },// { 9,-25, 38,-44, 44,-38, 25, -9, -9, 25,-38, 44,-44, 38,-25, 9, 9,-25, 38,-44, 44,-38, 25, -9, -9, 25,-38, 44,-44, 38,-25, 9 },//28 { 7,-19, 30,-39, 44,-45, 43,-36, 27,-15, 2, 11,-23, 34,-41, 45,-45, 41,-34, 23,-11, -2, 15,-27, 36,-43, 45,-44, 39,-30, 19, -7 },// { 4,-13, 21,-29, 35,-40, 43,-45, 45,-43, 40,-35, 29,-21, 13, -4, -4, 13,-21, 29,-35, 40,-43, 45,-45, 43,-40, 35,-29, 21,-13, 4 },//30 { 2, -7, 11,-15, 19,-23, 27,-30, 34,-36, 39,-41, 43,-44, 45,-45, 45,-45, 44,-43, 41,-39, 36,-34, 30,-27, 23,-19, 15,-11, 7, -2 } //31 }; /* --------------------------------------------------------------------------- * secondary transform */ ALIGN16(const int16_t g_2T[SEC_TR_SIZE * SEC_TR_SIZE]) = { 123, -35, -8, -3, -32, -120, 30, 10, 14, 25, 123, -22, 8, 
13, 19, 126 }; /* --------------------------------------------------------------------------- * secondary transform (only for 4x4) */ ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]) = { 34, 58, 72, 81, 77, 69, -7, -75, 79, -33, -75, 58, 55, -84, 73, -28 }; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * dst = g_T4 x src_T */ static void partialButterfly4(const coeff_t *src, coeff_t *dst, int shift, int line) { int E[2], O[2]; const int add = ((1 << shift) >> 1); int j; for (j = 0; j < line; j++) { /* E and O */ E[0] = src[0] + src[3]; O[0] = src[0] - src[3]; E[1] = src[1] + src[2]; O[1] = src[1] - src[2]; dst[0 ] = (coeff_t)((g_T4[0][0] * E[0] + g_T4[0][1] * E[1] + add) >> shift); dst[2 * line] = (coeff_t)((g_T4[2][0] * E[0] + g_T4[2][1] * E[1] + add) >> shift); dst[ line] = (coeff_t)((g_T4[1][0] * O[0] + g_T4[1][1] * O[1] + add) >> shift); dst[3 * line] = (coeff_t)((g_T4[3][0] * O[0] + g_T4[3][1] * O[1] + add) >> shift); src += 4; dst++; } } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse4(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[2], O[2]; const int max_val = ((1 << clip_depth) >> 1) - 1; const int min_val = -max_val - 1; const int add = ((1 << shift) >> 1); int j; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ O[0] = g_T4[1][0] * src[line] + g_T4[3][0] * src[3 * line]; O[1] = g_T4[1][1] * src[line] + g_T4[3][1] * src[3 * line]; E[0] = g_T4[0][0] * src[0 ] + g_T4[2][0] * src[2 * line]; E[1] = g_T4[0][1] * src[0 ] + g_T4[2][1] * src[2 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ dst[0] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[0] + O[0] + add) >> shift)); dst[1] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[1] + O[1] + add) >> shift)); dst[2] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[1] - O[1] + add) >> shift)); dst[3] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[0] - O[0] + add) >> shift)); src++; dst += 4; } } /* --------------------------------------------------------------------------- */ static void partialButterfly8(const coeff_t *src, coeff_t *dst, int shift, int line) { int E[4], O[4]; int EE[2], EO[2]; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* E and O*/ for (k = 0; k < 4; k++) { E[k] = src[k] + src[7 - k]; O[k] = src[k] - src[7 - k]; } /* EE and EO */ EE[0] = E[0] + E[3]; EO[0] = E[0] - E[3]; EE[1] = E[1] + E[2]; EO[1] = E[1] - E[2]; dst[0 ] = (coeff_t)((g_T8[0][0] * EE[0] + g_T8[0][1] * EE[1] + add) >> shift); dst[4 * line] = (coeff_t)((g_T8[4][0] * EE[0] + g_T8[4][1] * EE[1] + add) >> shift); dst[2 * line] = (coeff_t)((g_T8[2][0] * EO[0] + g_T8[2][1] * EO[1] + add) >> shift); dst[6 * line] = (coeff_t)((g_T8[6][0] * EO[0] + g_T8[6][1] * EO[1] + add) >> shift); dst[ line] = (coeff_t)((g_T8[1][0] * O[0] + g_T8[1][1] * O[1] + g_T8[1][2] * O[2] + g_T8[1][3] * O[3] + add) >> shift); dst[3 * line] = (coeff_t)((g_T8[3][0] * O[0] + g_T8[3][1] * O[1] + g_T8[3][2] * O[2] + g_T8[3][3] * O[3] + add) >> shift); dst[5 * line] = (coeff_t)((g_T8[5][0] * O[0] + g_T8[5][1] * O[1] + g_T8[5][2] * O[2] + g_T8[5][3] * O[3] + add) >> shift); dst[7 * line] = (coeff_t)((g_T8[7][0] * 
O[0] + g_T8[7][1] * O[1] + g_T8[7][2] * O[2] + g_T8[7][3] * O[3] + add) >> shift); src += 8; dst++; } } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse8(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[4], O[4]; int EE[2], EO[2]; const int max_val = ((1 << clip_depth) >> 1) - 1; const int min_val = -max_val - 1; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 4; k++) { O[k] = g_T8[1][k] * src[ line] + g_T8[3][k] * src[3 * line] + g_T8[5][k] * src[5 * line] + g_T8[7][k] * src[7 * line]; } EO[0] = g_T8[2][0] * src[2 * line] + g_T8[6][0] * src[6 * line]; EO[1] = g_T8[2][1] * src[2 * line] + g_T8[6][1] * src[6 * line]; EE[0] = g_T8[0][0] * src[0 ] + g_T8[4][0] * src[4 * line]; EE[1] = g_T8[0][1] * src[0 ] + g_T8[4][1] * src[4 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ E[0] = EE[0] + EO[0]; E[3] = EE[0] - EO[0]; E[1] = EE[1] + EO[1]; E[2] = EE[1] - EO[1]; for (k = 0; k < 4; k++) { dst[k ] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[k ] + O[k ] + add) >> shift)); dst[k + 4] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[3 - k] - O[3 - k] + add) >> shift)); } src++; dst += 8; } } /* --------------------------------------------------------------------------- */ static void partialButterfly16(const coeff_t *src, coeff_t *dst, int shift, int line) { int E[8], O[8]; int EE[4], EO[4]; int EEE[2], EEO[2]; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* E and O*/ for (k = 0; k < 8; k++) { E[k] = src[k] + src[15 - k]; O[k] = src[k] - src[15 - k]; } /* EE and EO */ for (k = 0; k < 4; k++) { EE[k] = E[k] + E[7 - k]; EO[k] = E[k] - E[7 - k]; } /* EEE and EEO */ EEE[0] = EE[0] + EE[3]; EEO[0] = EE[0] - EE[3]; EEE[1] = EE[1] + EE[2]; EEO[1] = EE[1] - EE[2]; dst[ 0 ] = (coeff_t)((g_T16[ 0][0] * EEE[0] + g_T16[ 0][1] * EEE[1] + add) >> shift); dst[ 8 * line] = (coeff_t)((g_T16[ 8][0] * EEE[0] + g_T16[ 8][1] * EEE[1] + add) >> shift); dst[ 4 * line] = (coeff_t)((g_T16[ 4][0] * EEO[0] + g_T16[ 4][1] * EEO[1] + add) >> shift); dst[12 * line] = (coeff_t)((g_T16[12][0] * EEO[0] + g_T16[12][1] * EEO[1] + add) >> shift); for (k = 2; k < 16; k += 4) { dst[k * line] = (coeff_t)((g_T16[k][0] * EO[0] + g_T16[k][1] * EO[1] + g_T16[k][2] * EO[2] + g_T16[k][3] * EO[3] + add) >> shift); } for (k = 1; k < 16; k += 2) { dst[k * line] = (coeff_t)((g_T16[k][0] * O[0] + g_T16[k][1] * O[1] + g_T16[k][2] * O[2] + g_T16[k][3] * O[3] + g_T16[k][4] * O[4] + g_T16[k][5] * O[5] + g_T16[k][6] * O[6] + g_T16[k][7] * O[7] + add) >> shift); } src += 16; dst++; } } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse16(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[8], O[8]; int EE[4], EO[4]; int EEE[2], EEO[2]; const int max_val = ((1 << clip_depth) >> 1) - 1; const int min_val = -max_val - 1; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 8; k++) { O[k] = g_T16[ 1][k] * src[ line] + g_T16[ 3][k] * src[ 3 * line] + g_T16[ 5][k] * src[ 5 * line] + g_T16[ 7][k] * src[ 7 * line] + g_T16[ 9][k] * src[ 9 * line] + g_T16[11][k] * src[11 * line] + g_T16[13][k] * src[13 * line] + 
g_T16[15][k] * src[15 * line]; } for (k = 0; k < 4; k++) { EO[k] = g_T16[ 2][k] * src[ 2 * line] + g_T16[ 6][k] * src[ 6 * line] + g_T16[10][k] * src[10 * line] + g_T16[14][k] * src[14 * line]; } EEO[0] = g_T16[4][0] * src[4 * line] + g_T16[12][0] * src[12 * line]; EEE[0] = g_T16[0][0] * src[0 ] + g_T16[ 8][0] * src[ 8 * line]; EEO[1] = g_T16[4][1] * src[4 * line] + g_T16[12][1] * src[12 * line]; EEE[1] = g_T16[0][1] * src[0 ] + g_T16[ 8][1] * src[ 8 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ for (k = 0; k < 2; k++) { EE[k ] = EEE[k ] + EEO[k ]; EE[k + 2] = EEE[1 - k] - EEO[1 - k]; } for (k = 0; k < 4; k++) { E[k ] = EE[k ] + EO[k ]; E[k + 4] = EE[3 - k] - EO[3 - k]; } for (k = 0; k < 8; k++) { dst[k ] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[k ] + O[k ] + add) >> shift)); dst[k + 8] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[7 - k] - O[7 - k] + add) >> shift)); } src++; dst += 16; } } /* --------------------------------------------------------------------------- */ static void partialButterfly32(const coeff_t *src, coeff_t *dst, int shift, int line) { int E[16], O[16]; int EE[8], EO[8]; int EEE[4], EEO[4]; int EEEE[2], EEEO[2]; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* E and O*/ for (k = 0; k < 16; k++) { E[k] = src[k] + src[31 - k]; O[k] = src[k] - src[31 - k]; } /* EE and EO */ for (k = 0; k < 8; k++) { EE[k] = E[k] + E[15 - k]; EO[k] = E[k] - E[15 - k]; } /* EEE and EEO */ for (k = 0; k < 4; k++) { EEE[k] = EE[k] + EE[7 - k]; EEO[k] = EE[k] - EE[7 - k]; } /* EEEE and EEEO */ EEEE[0] = EEE[0] + EEE[3]; EEEO[0] = EEE[0] - EEE[3]; EEEE[1] = EEE[1] + EEE[2]; EEEO[1] = EEE[1] - EEE[2]; dst[ 0 ] = (coeff_t)((g_T32[ 0][0] * EEEE[0] + g_T32[ 0][1] * EEEE[1] + add) >> shift); dst[ 8 * line] = (coeff_t)((g_T32[ 8][0] * EEEO[0] + g_T32[ 8][1] * EEEO[1] + add) >> shift); dst[16 * line] = (coeff_t)((g_T32[16][0] * EEEE[0] + g_T32[16][1] * EEEE[1] + add) >> shift); dst[24 * line] = (coeff_t)((g_T32[24][0] * EEEO[0] + g_T32[24][1] * EEEO[1] + add) >> shift); for (k = 4; k < 32; k += 8) { dst[k * line] = (coeff_t)((g_T32[k][0] * EEO[0] + g_T32[k][1] * EEO[1] + g_T32[k][2] * EEO[2] + g_T32[k][3] * EEO[3] + add) >> shift); } for (k = 2; k < 32; k += 4) { dst[k * line] = (coeff_t)((g_T32[k][0] * EO[0] + g_T32[k][1] * EO[1] + g_T32[k][2] * EO[2] + g_T32[k][3] * EO[3] + g_T32[k][4] * EO[4] + g_T32[k][5] * EO[5] + g_T32[k][6] * EO[6] + g_T32[k][7] * EO[7] + add) >> shift); } for (k = 1; k < 32; k += 2) { dst[k * line] = (coeff_t)((g_T32[k][ 0] * O[ 0] + g_T32[k][ 1] * O[ 1] + g_T32[k][ 2] * O[ 2] + g_T32[k][ 3] * O[ 3] + g_T32[k][ 4] * O[ 4] + g_T32[k][ 5] * O[ 5] + g_T32[k][ 6] * O[ 6] + g_T32[k][ 7] * O[ 7] + g_T32[k][ 8] * O[ 8] + g_T32[k][ 9] * O[ 9] + g_T32[k][10] * O[10] + g_T32[k][11] * O[11] + g_T32[k][12] * O[12] + g_T32[k][13] * O[13] + g_T32[k][14] * O[14] + g_T32[k][15] * O[15] + add) >> shift); } src += 32; dst++; } } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse32(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[16], O[16]; int EE[8], EO[8]; int EEE[4], EEO[4]; int EEEE[2], EEEO[2]; const int max_val = ((1 << clip_depth) >> 1) - 1; const int min_val = -max_val - 1; const int add = ((1 << shift) >> 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 16; k++) { O[k] = g_T32[ 1][k] 
* src[ line] + g_T32[ 3][k] * src[ 3 * line] + g_T32[ 5][k] * src[ 5 * line] + g_T32[ 7][k] * src[ 7 * line] + g_T32[ 9][k] * src[ 9 * line] + g_T32[11][k] * src[11 * line] + g_T32[13][k] * src[13 * line] + g_T32[15][k] * src[15 * line] + g_T32[17][k] * src[17 * line] + g_T32[19][k] * src[19 * line] + g_T32[21][k] * src[21 * line] + g_T32[23][k] * src[23 * line] + g_T32[25][k] * src[25 * line] + g_T32[27][k] * src[27 * line] + g_T32[29][k] * src[29 * line] + g_T32[31][k] * src[31 * line]; } for (k = 0; k < 8; k++) { EO[k] = g_T32[ 2][k] * src[ 2 * line] + g_T32[ 6][k] * src[ 6 * line] + g_T32[10][k] * src[10 * line] + g_T32[14][k] * src[14 * line] + g_T32[18][k] * src[18 * line] + g_T32[22][k] * src[22 * line] + g_T32[26][k] * src[26 * line] + g_T32[30][k] * src[30 * line]; } for (k = 0; k < 4; k++) { EEO[k] = g_T32[ 4][k] * src[ 4 * line] + g_T32[12][k] * src[12 * line] + g_T32[20][k] * src[20 * line] + g_T32[28][k] * src[28 * line]; } EEEO[0] = g_T32[8][0] * src[8 * line] + g_T32[24][0] * src[24 * line]; EEEO[1] = g_T32[8][1] * src[8 * line] + g_T32[24][1] * src[24 * line]; EEEE[0] = g_T32[0][0] * src[0 ] + g_T32[16][0] * src[16 * line]; EEEE[1] = g_T32[0][1] * src[0 ] + g_T32[16][1] * src[16 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ EEE[0] = EEEE[0] + EEEO[0]; EEE[3] = EEEE[0] - EEEO[0]; EEE[1] = EEEE[1] + EEEO[1]; EEE[2] = EEEE[1] - EEEO[1]; for (k = 0; k < 4; k++) { EE[k ] = EEE[k ] + EEO[k ]; EE[k + 4] = EEE[3 - k] - EEO[3 - k]; } for (k = 0; k < 8; k++) { E[k ] = EE[k ] + EO[k ]; E[k + 8] = EE[7 - k] - EO[7 - k]; } for (k = 0; k < 16; k++) { dst[k ] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[k ] + O[k ] + add) >> shift)); dst[k + 16] = (coeff_t)XAVS2_CLIP3(min_val, max_val, ((E[15 - k] - O[15 - k] + add) >> shift)); } src++; dst += 32; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void wavelet_64x64_c(const coeff_t *src, coeff_t *dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 64; const int N1 = 64 >> 1; int x, y, offset; /* step 1: horizontal transform */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { /* copy */ memcpy(pExt, src + offset, N0 * sizeof(coeff_t)); /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N0 ] = pExt[N0 - 2]; pExt[N0 + 1] = pExt[N0 - 3]; /* filtering (H) */ for (x = -1; x < N0; x += 2) { pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; } /* filtering (L) */ for (x = 0; x < N0; x += 2) { pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; } /* copy */ for (x = 0; x < N1; x++) { dst[x + offset] = pExt[x << 1]; } } /* step 2: vertical transform */ for (x = 0; x < N1; x++) { /* copy */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { pExt[y] = dst[x + offset]; } /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N0 ] = pExt[N0 - 2]; pExt[N0 + 1] = pExt[N0 - 3]; /* filtering (H) */ for (y = -1; y < N0; y += 2) { pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; } /* filtering (L) */ for (y = 0; y < N0; y += 2) { pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); } /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += 32) { dst[x + offset] = pExt[y << 1]; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void wavelet_64x16_c(const coeff_t *src, coeff_t *dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; 
const int N0 = 64; const int N1 = 16; int x, y, offset; /* step 1: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ memcpy(pExt, src + offset, N0 * sizeof(coeff_t)); /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N0 ] = pExt[N0 - 2]; pExt[N0 + 1] = pExt[N0 - 3]; /* filtering (H) */ for (x = -1; x < N0; x += 2) { pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; } /* filtering (L) */ for (x = 0; x < N0; x += 2) { pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; } /* copy */ for (x = 0; x < N0 >> 1; x++) { dst[x + offset] = pExt[x << 1]; } } /* step 2: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { pExt[y] = dst[x + offset]; } /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N1 ] = pExt[N1 - 2]; pExt[N1 + 1] = pExt[N1 - 3]; /* filtering (H) */ for (y = -1; y < N1; y += 2) { pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; } /* filtering (L) */ for (y = 0; y < N1; y += 2) { pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); } /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, offset += N0) { dst[x + (offset >> 1)] = pExt[y << 1]; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void wavelet_16x64_c(const coeff_t *src, coeff_t *dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 16; const int N1 = 64; int x, y, offset; /* step 1: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ memcpy(pExt, src + offset, N0 * sizeof(coeff_t)); /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N0 ] = pExt[N0 - 2]; pExt[N0 + 1] = pExt[N0 - 3]; /* filtering (H) */ for (x = -1; x < N0; x += 2) { pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; } /* filtering (L) */ for (x = 0; x < N0; x += 2) { pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; } /* copy */ for (x = 0; x < N0 >> 1; x++) { dst[x + offset] = pExt[x << 1]; } } /* step 2: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { pExt[y] = dst[x + offset]; } /* reflection */ pExt[-1 ] = pExt[1 ]; pExt[-2 ] = pExt[2 ]; pExt[N1 ] = pExt[N1 - 2]; pExt[N1 + 1] = pExt[N1 - 3]; /* filtering (H) */ for (y = -1; y < N1; y += 2) { pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; } /* filtering (L) */ for (y = 0; y < N1; y += 2) { pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); } /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, offset += N0) { dst[x + (offset >> 1)] = pExt[y << 1]; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void inv_wavelet_64x64_c(coeff_t *coeff) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 64; const int N1 = 64 >> 1; int x, y, offset; /* step 1: vertical transform */ for (x = 0; x < N0; x++) { /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += 32) { pExt[y << 1] = coeff[x + offset]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N0; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N0; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { coeff[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { /* copy */ for (x 
= 0; x < N1; x++) { pExt[x << 1] = coeff[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(coeff + offset, pExt, N0 * sizeof(coeff_t)); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void inv_wavelet_64x16_c(coeff_t *coeff) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 64; const int N1 = 16; int x, y, offset; /* step 1: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, offset += (N0 >> 1)) { pExt[y << 1] = coeff[x + offset]; } /* reflection */ pExt[N1] = pExt[N1 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N1; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N1; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { coeff[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ for (x = 0; x < N0 >> 1; x++) { pExt[x << 1] = coeff[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(coeff + offset, pExt, N0 * sizeof(coeff_t)); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void inv_wavelet_16x64_c(coeff_t *coeff) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 16; const int N1 = 64; int x, y, offset; /* step 1: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, offset += (N0 >> 1)) { pExt[y << 1] = coeff[x + offset]; } /* reflection */ pExt[N1] = pExt[N1 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N1; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N1; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { coeff[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ for (x = 0; x < N0 >> 1; x++) { pExt[x << 1] = coeff[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(coeff + offset, pExt, N0 * sizeof(coeff_t)); } } /** * =========================================================================== * local function defines for secondary transform * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Hor(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int add = (1 << i_shift) >> 1; int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[i * SEC_TR_SIZE + k] * tmp_dct[j * SEC_TR_SIZE + k]; } coeff[j * i_coeff + i] = (coeff_t)XAVS2_CLIP3(-32768, 32767, sum >> i_shift); } } } /* 
--------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Ver(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int add = (1 << i_shift) >> 1; int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[i * SEC_TR_SIZE + k] * tmp_dct[k * SEC_TR_SIZE + j]; } coeff[i * i_coeff + j] = (coeff_t)XAVS2_CLIP3(-32768, 32767, sum >> i_shift); } } } /* --------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Inv_Ver(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int add = (1 << i_shift) >> 1; int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[k * SEC_TR_SIZE + i] * tmp_dct[k * SEC_TR_SIZE + j]; } coeff[i * i_coeff + j] = (coeff_t)XAVS2_CLIP3(-32768, 32767, sum >> i_shift); } } } /* --------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Inv_Hor(coeff_t *coeff, int i_coeff, int i_shift, int clip_depth, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int max_val = ((1 << clip_depth) >> 1) - 1; const int min_val = -max_val - 1; const int add = (1 << i_shift) >> 1; int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[k * SEC_TR_SIZE + i] * tmp_dct[j * SEC_TR_SIZE + k]; } coeff[j * i_coeff + i] = (coeff_t)XAVS2_CLIP3(min_val, max_val, sum >> i_shift); } } } /* --------------------------------------------------------------------------- */ static void transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) { const int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1; const int shift2 = B4X4_IN_BIT + FACTO_BIT + 1; xTr2nd_4_1d_Hor(coeff, i_coeff, shift1, g_2T_C); xTr2nd_4_1d_Ver(coeff, i_coeff, shift2, g_2T_C); } /* --------------------------------------------------------------------------- */ static void inv_transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth + 2; const int clip_depth2 = g_bit_depth + 1; xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, shift1, g_2T_C); xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, shift2, clip_depth2, g_2T_C); } /* --------------------------------------------------------------------------- * i_mode - real intra mode (luma) * b_top - block top available? * b_left - block left available? */ static void transform_2nd_c(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (vt && b_top) { xTr2nd_4_1d_Ver(coeff, i_coeff, 7, g_2T); } if (ht && b_left) { xTr2nd_4_1d_Hor(coeff, i_coeff, 7, g_2T); } } /* --------------------------------------------------------------------------- * i_mode - real intra mode (luma) * b_top - block top available? * b_left - block left available? 
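 * Note: vt/ht below select the directions of the secondary transform from the
 * intra mode: the vertical pass is used for modes 0..23 (when the top
 * neighbour is available) and the horizontal pass for modes 13..32 or 0..2
 * (when the left neighbour is available); the inverse applies the horizontal
 * pass before the vertical one, i.e. the reverse of the forward order.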
*/ static void inv_transform_2nd_c(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (ht && b_left) { xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, 7, 16, g_2T); } if (vt && b_top) { xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, 7, g_2T); } } /* --------------------------------------------------------------------------- */ static void dct_4x4_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 4 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; // 0 int shift2 = B4X4_IN_BIT + FACTO_BIT; // 7 int i; for (i = 0; i < BSIZE; i++) { memcpy(&block[i * BSIZE], &src[i * i_src], BSIZE * sizeof(coeff_t)); } // coeff = g_T4 x block^T partialButterfly4(block, coeff, shift1, BSIZE); // dst = g_T4 x coeff^T = g_T4 x (g_T4 x block ^T)^T = g_T4 x block x g_T4^T partialButterfly4(coeff, dst, shift2, BSIZE); #undef BSIZE } /* --------------------------------------------------------------------------- */ static void idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 4 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse4(src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse4(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void dct_8x8_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 8 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B8X8_IN_BIT + FACTO_BIT; int i; for (i = 0; i < BSIZE; i++) { memcpy(&block[i * BSIZE], &src[i * i_src], BSIZE * sizeof(coeff_t)); } partialButterfly8(block, coeff, shift1, BSIZE); partialButterfly8(coeff, dst, shift2, BSIZE); #undef BSIZE } /* --------------------------------------------------------------------------- */ static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 8 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse8(src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse8(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void dct_16x16_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 16 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B16X16_IN_BIT + FACTO_BIT; int i; for (i = 0; i < BSIZE; i++) { memcpy(&block[i * BSIZE], &src[i * i_src], BSIZE * sizeof(coeff_t)); } partialButterfly16(block, coeff, shift1, BSIZE); partialButterfly16(coeff, dst, shift2, BSIZE); #undef BSIZE } /* --------------------------------------------------------------------------- */ static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int 
i_dst) { #define BSIZE 16 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse16(src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse16(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); } #undef BSIZE } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_32x32_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE 32 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; int i; i_src &= 0xFE; /* remember to remove the flag bit */ for (i = 0; i < BSIZE; i++) { memcpy(&block[i * BSIZE], &src[i * i_src], BSIZE * sizeof(coeff_t)); } partialButterfly32(block, coeff, shift1, BSIZE); partialButterfly32(coeff, dst, shift2, BSIZE); #undef BSIZE } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_32x32_half_c(const coeff_t *src, coeff_t *dst, int i_src) { int i; dct_32x32_c(src, dst, i_src); for (i = 0; i < 16; i++) { memset(dst + 16, 0, 16 * sizeof(coeff_t)); dst += 32; } memset(dst, 0, 32 * 16 * sizeof(coeff_t)); } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 32 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int a_flag = i_dst & 0x01; int shift1 = 5; int shift2 = 20 - g_bit_depth - a_flag; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + a_flag; int i; i_dst &= 0xFE; /* remember to remove the flag bit */ partialButterflyInverse32(src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse32(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void dct_16x4_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 16 #define BSIZE_V 4 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B16X16_IN_BIT + FACTO_BIT - 2; int i; for (i = 0; i < BSIZE_V; i++) { memcpy(&block[i * BSIZE_H], &src[i * i_src], BSIZE_H * sizeof(coeff_t)); } partialButterfly16(block, coeff, shift1, BSIZE_V); partialButterfly4 (coeff, dst, shift2, BSIZE_H); #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- */ static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 16 #define BSIZE_V 4 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse4 (src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse16(coeff, block, shift2, 
BSIZE_V, clip_depth2); for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- */ static void dct_4x16_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 4 #define BSIZE_V 16 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2; int shift2 = B16X16_IN_BIT + FACTO_BIT; int i; for (i = 0; i < BSIZE_V; i++) { memcpy(&block[i * BSIZE_H], &src[i * i_src], BSIZE_H * sizeof(coeff_t)); } partialButterfly4 (block, coeff, shift1, BSIZE_V); partialButterfly16(coeff, dst, shift2, BSIZE_H); #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- */ static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 4 #define BSIZE_V 16 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse16(src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse4 (coeff, block, shift2, BSIZE_V, clip_depth2); for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_32x8_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 32 #define BSIZE_V 8 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01); int i; i_src &= 0xFE; for (i = 0; i < BSIZE_V; i++) { memcpy(&block[i * BSIZE_H], &src[i * i_src], BSIZE_H * sizeof(coeff_t)); } partialButterfly32(block, coeff, shift1, BSIZE_V); partialButterfly8 (coeff, dst, shift2, BSIZE_H); #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 32 #define BSIZE_V 8 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse8 (src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse32(coeff, block, shift2, BSIZE_V, clip_depth2); i_dst &= 0xFE; for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_8x32_c(const coeff_t *src, coeff_t *dst, int i_src) { #define BSIZE_H 8 #define BSIZE_V 32 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; int i; i_src &= 0xFE; for (i = 0; i < BSIZE_V; 
i++) { memcpy(&block[i * BSIZE_H], &src[i * i_src], BSIZE_H * sizeof(coeff_t)); } partialButterfly8 (block, coeff, shift1, BSIZE_V); partialButterfly32(coeff, dst, shift2, BSIZE_H); #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 8 #define BSIZE_V 32 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse32(src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse8 (coeff, block, shift2, BSIZE_V, clip_depth2); i_dst &= 0xFE; for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_64x64_c(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x64_c(src, dst); dct_32x32_c(dst, dst, 32 | 0x01); /* 32x32 dct */ } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_64x64_half_c(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x64_c(src, dst); dct_32x32_half_c(dst, dst, 32 | 0x01); /* 32x32 dct */ } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x32_c(src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_c(dst); } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_64x16_c(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_64x16_c(src, dst); dct_32x8_c(dst, dst, 32 | 0x01); } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x8_c(src, dst, 32 | 0x01); inv_wavelet_64x16_c(dst); } /* --------------------------------------------------------------------------- * NOTE: * i_src - the stride of src (the lowest bit is additional wavelet flag) */ static void dct_16x64_c(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(i_src); wavelet_16x64_c(src, dst); dct_8x32_c(dst, dst, 8 | 0x01); } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_16x64_c(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_8x32_c(src, dst, 8 | 0x01); inv_wavelet_16x64_c(dst); } /** * =========================================================================== * interface function defines * =========================================================================== */ /* 
---------------------------------------------------------------------------
 * init dct function handles */
void xavs2_dct_init(uint32_t cpuid, dct_funcs_t *dctf)
{
    /* -------------------------------------------------------------
     * set handles with default c functions */
    /* dct: square */
    dctf->dct [LUMA_4x4  ] = dct_4x4_c;
    dctf->dct [LUMA_8x8  ] = dct_8x8_c;
    dctf->dct [LUMA_16x16] = dct_16x16_c;
    dctf->dct [LUMA_32x32] = dct_32x32_c;
    dctf->dct [LUMA_64x64] = dct_64x64_c;
    /* dct: asymmetrical */
    dctf->dct [LUMA_16x4 ] = dct_16x4_c;
    dctf->dct [LUMA_4x16 ] = dct_4x16_c;
    dctf->dct [LUMA_32x8 ] = dct_32x8_c;
    dctf->dct [LUMA_8x32 ] = dct_8x32_c;
    dctf->dct [LUMA_64x16] = dct_64x16_c;
    dctf->dct [LUMA_16x64] = dct_16x64_c;
    /* idct: square */
    dctf->idct[LUMA_4x4  ] = idct_4x4_c;
    dctf->idct[LUMA_8x8  ] = idct_8x8_c;
    dctf->idct[LUMA_16x16] = idct_16x16_c;
    dctf->idct[LUMA_32x32] = idct_32x32_c;
    dctf->idct[LUMA_64x64] = idct_64x64_c;
    /* idct: asymmetrical */
    dctf->idct[LUMA_16x4 ] = idct_16x4_c;
    dctf->idct[LUMA_4x16 ] = idct_4x16_c;
    dctf->idct[LUMA_32x8 ] = idct_32x8_c;
    dctf->idct[LUMA_8x32 ] = idct_8x32_c;
    dctf->idct[LUMA_64x16] = idct_64x16_c;
    dctf->idct[LUMA_16x64] = idct_16x64_c;
    /* 2nd transform */
    dctf->transform_4x4_2nd     = transform_4x4_2nd_c;
    dctf->inv_transform_4x4_2nd = inv_transform_4x4_2nd_c;
    dctf->transform_2nd         = transform_2nd_c;
    dctf->inv_transform_2nd     = inv_transform_2nd_c;
    /* DCT half */
    dctf->dct_half[LUMA_32x32] = dct_32x32_half_c;
    dctf->dct_half[LUMA_64x64] = dct_64x64_half_c;

#if HAVE_MMX
    /* -------------------------------------------------------------
     * set handles with asm functions */
    /* functions defined in file intrinsic_dct.c */
    if (cpuid & XAVS2_CPU_SSE42) {
        /* dct: square */
        dctf->dct [LUMA_4x4  ] = dct_c_4x4_sse128;
        dctf->dct [LUMA_8x8  ] = dct_c_8x8_sse128;
        dctf->dct [LUMA_16x16] = dct_c_16x16_sse128;
        dctf->dct [LUMA_32x32] = dct_c_32x32_sse128;
        dctf->dct [LUMA_64x64] = dct_c_64x64_sse128;
        /* dct: asymmetrical */
        dctf->dct[LUMA_16x4 ] = dct_c_16x4_sse128;
        dctf->dct[LUMA_4x16 ] = dct_c_4x16_sse128;  // NOTE: the first transform pass does not apply the shift
        dctf->dct[LUMA_32x8 ] = dct_c_32x8_sse128;
        dctf->dct[LUMA_8x32 ] = dct_c_8x32_sse128;
        dctf->dct[LUMA_64x16] = dct_c_64x16_sse128;
        dctf->dct[LUMA_16x64] = dct_c_16x64_sse128;
        /* idct: square */
        dctf->idct[LUMA_4x4  ] = idct_c_4x4_sse128;
        dctf->idct[LUMA_8x8  ] = idct_c_8x8_sse128;
        dctf->idct[LUMA_16x16] = idct_c_16x16_sse128;
        dctf->idct[LUMA_32x32] = idct_c_32x32_sse128;
        dctf->idct[LUMA_64x64] = idct_c_64x64_sse128;
        /* idct: asymmetrical */
        dctf->idct[LUMA_16x4 ] = idct_c_16x4_sse128;
        dctf->idct[LUMA_4x16 ] = idct_c_4x16_sse128;
        dctf->idct[LUMA_32x8 ] = idct_c_32x8_sse128;
        dctf->idct[LUMA_8x32 ] = idct_c_8x32_sse128;
        dctf->idct[LUMA_64x16] = idct_c_64x16_sse128;
        dctf->idct[LUMA_16x64] = idct_c_16x64_sse128;
        /* 2nd transform */
        dctf->transform_4x4_2nd     = transform_4x4_2nd_sse128;
        dctf->inv_transform_4x4_2nd = inv_transform_4x4_2nd_sse128;
        dctf->transform_2nd         = transform_2nd_sse128;
        dctf->inv_transform_2nd     = inv_transform_2nd_sse128;
        // half transform
        dctf->dct_half[LUMA_32x32] = dct_c_32x32_half_sse128;
        dctf->dct_half[LUMA_64x64] = dct_c_64x64_half_sse128;
    }
    if (cpuid & XAVS2_CPU_SSE2) {
        dctf->dct [LUMA_4x4 ] = xavs2_dct_4x4_sse2;
        dctf->dct [LUMA_8x8 ] = xavs2_dct_8x8_sse2;
        dctf->idct[LUMA_4x4 ] = xavs2_idct_4x4_sse2;
#if ARCH_X86_64
        dctf->idct[LUMA_8x8 ] = xavs2_idct_8x8_sse2;
#endif
    }
    if (cpuid & XAVS2_CPU_SSSE3) {
        dctf->idct[LUMA_8x8 ] = xavs2_idct_8x8_ssse3;
    }
    if (cpuid & XAVS2_CPU_SSE4) {
        dctf->dct[LUMA_8x8 ] = xavs2_dct_8x8_sse4;
    }
    if (cpuid & XAVS2_CPU_AVX2) {
        dctf->dct [LUMA_4x4  ] = xavs2_dct_4x4_avx2;
#if ARCH_X86_64
        dctf->dct [LUMA_8x8  ] = xavs2_dct_8x8_avx2;
        dctf->dct [LUMA_16x16] = xavs2_dct_16x16_avx2;  // slower than dct_16x16_avx2
        dctf->dct [LUMA_32x32] = xavs2_dct_32x32_avx2;
        dctf->idct[LUMA_4x4  ] = xavs2_idct_4x4_avx2;
        dctf->idct[LUMA_8x8  ] = xavs2_idct_8x8_avx2;
        dctf->idct[LUMA_16x16] = xavs2_idct_16x16_avx2;
        dctf->idct[LUMA_32x32] = xavs2_idct_32x32_avx2;
#endif
    }

#if ARCH_X86_64
    if (cpuid & XAVS2_CPU_AVX2) {
        // dctf->dct[LUMA_4x4 ] = dct_c_4x4_avx2;    /* futl: dct_4x4_avx2 is slower than dct_4x4_sse128 */
        // dctf->dct[LUMA_8x8 ] = dct_c_8x8_avx2;    /* futl: dct_8x8_avx2 is slower than xavs2_dct_8x8_avx2 */
        // dctf->dct[LUMA_4x16] = dct_c_4x16_avx2;   /* futl: dct_4x16_avx2 is slower than dct_4x16_sse128 */
        dctf->dct[LUMA_16x4 ] = dct_c_16x4_avx2;     /* faster than the sse128 version */
        dctf->dct[LUMA_8x32 ] = dct_c_8x32_avx2;
        dctf->dct[LUMA_32x8 ] = dct_c_32x8_avx2;
        dctf->dct[LUMA_16x16] = dct_c_16x16_avx2;
        // dctf->dct[LUMA_32x32] = dct_c_32x32_avx2; /* asm faster than intrinsic */
        dctf->dct[LUMA_64x64] = dct_c_64x64_avx2;
        dctf->dct[LUMA_64x16] = dct_c_64x16_avx2;
        dctf->dct[LUMA_16x64] = dct_c_16x64_avx2;

        dctf->idct[LUMA_8x8  ] = idct_c_8x8_avx2;
        dctf->idct[LUMA_16x16] = idct_c_16x16_avx2;
        dctf->idct[LUMA_32x32] = idct_c_32x32_avx2;
        dctf->idct[LUMA_64x64] = idct_c_64x64_avx2;
        dctf->idct[LUMA_64x16] = idct_c_64x16_avx2;
        dctf->idct[LUMA_16x64] = idct_c_16x64_avx2;

        dctf->dct_half[LUMA_32x32] = dct_c_32x32_half_avx2;
        dctf->dct_half[LUMA_64x64] = dct_c_64x64_half_avx2;
    }
#endif  // ARCH_X86_64
#else
    UNUSED_PARAMETER(cpuid);
#endif  // if HAVE_MMX
}

/*----------------------------------------------------------------------------------
 * dct: time consumed for each block size (ms); test sources of different sizes may differ
 *----------------------------------------------------------------------------------
 *   size            intrinsic                   asm
 *            |   sse    |   avx    |   |  sse  |   avx    |
 *----------------------------------------------------------------------------------
 *  4 x 4     |          |          |   |       |          |
 *  4 x 16    |          |          |   |       |          |
 *  8 x 8     |          |          |   |       |          |
 *  8 x 32    |          |          |   |       |          |
 * 16 x 16    |          |    33301 |   |       |    35150 |
 * 16 x 4     |          |          |   |       |          |
 * 32 x 32    |   773186 |   154968 |   |       |   134811 |
 * 32 x 8     |          |          |   |       |          |
 * 64 x 64    |          |          |   |       |          |
 * ----------------------------------------------------------------------------------
 *
 * idct: time consumed for each block size (ms)
 *----------------------------------------------------------------------------------
 *   size            intrinsic                   asm
 *            |   sse    |   avx    |   |  sse  |   avx    |
 *----------------------------------------------------------------------------------
 *  4 x 4     |          |          |   |       |          |
 *  4 x 16    |          |          |   |       |          |
 *  8 x 8     |          |          |   |       |          |
 *  8 x 32    |          |          |   |       |          |
 * 16 x 16    |          |          |   |       |          |
 * 16 x 4     |          |          |   |       |          |
 * 32 x 32    |    18401 |     8258 |   |       |    13413 |
 * 32 x 8     |          |          |   |       |          |
 * 64 x 64    |          |          |   |       |          |
 * ----------------------------------------------------------------------------------
 */
xavs2-1.3/source/common/transform.h000066400000000000000000000036231340660520300173770ustar00rootroot00000000000000/*
 * transform.h
 *
 * Description of this file:
 *    transform functions definition of the xavs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    Homepage1: http://vcl.idm.pku.edu.cn/xavs2
 *    Homepage2: https://github.com/pkuvcl/xavs2
 *    Homepage3: https://gitee.com/pkuvcl/xavs2
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_TRANSFORM_H #define XAVS2_TRANSFORM_H /** * =========================================================================== * type defines * =========================================================================== */ /** * =========================================================================== * function declares * =========================================================================== */ #endif // XAVS2_TRANSFORM_H xavs2-1.3/source/common/vec/000077500000000000000000000000001340660520300157645ustar00rootroot00000000000000xavs2-1.3/source/common/vec/intrinsic.c000066400000000000000000000706271340660520300201460ustar00rootroot00000000000000/* * intrinsic.c * * Description of this file: * tables used in SIMD assembly functions of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Jiaqi ZHANG * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../basic_types.h" #include "intrinsic.h" #include #include #include #include ALIGN32(const int8_t intrinsic_mask[15][16]) = { { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } }; ALIGN32(const int8_t intrinsic_mask_256_8bit[16][32]) = { { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; ALIGN32(const int8_t intrinsic_mask32[32][32]) = { { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 
-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } }; ALIGN32(const int8_t tab_log2[65]) = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; const uint8_t tab_idx_mode_7[64] = { 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 23, 24, 25, 26, 26, 27, 28, 29, 29, 30, 31, 
31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 39, 40, 41, 42, 42, 43, 44, 45, 45, 46 }; ALIGN16(const pel_t tab_coeff_mode_7[64][16]) = { { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 },//0 { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 },//8 { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 },//16 { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 },//24 { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 },//32 { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 },//40 { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 },//48 { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 
57, 25, 7, 39, 57, 25 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7 }, { 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 },//56 { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }//63 }; ALIGN32(const pel_t tab_coeff_mode_7_avx[64][32]) = { { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23},//0 { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29}, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26}, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17},//8 { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23}, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11},//16 { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17}, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 
1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5},//24 { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31},//32 { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25},//40 { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13}, { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19},//48 { 22, 
54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7}, { 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13},//56 { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4}, { 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1}, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}//63 }; ALIGN16(const int8_t tab_coeff_mode_9[64][16]) = { { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 
34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 } }; const uint8_t tab_idx_mode_9[64] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23 }; const ALIGN16(int8_t tab_coeff_mode_11[64][16]) = { { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0 } }; 
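/* ---------------------------------------------------------------------------
 * Editor's note -- illustrative sketch, not part of the original xavs2 sources.
 *
 * Each distinct 4-tap row of the tab_coeff_mode_* tables above sums to 128,
 * i.e. the rows are fixed-point interpolation weights for the fractional
 * reference positions of the corresponding angular intra-prediction modes.
 * The 16-byte (SSE) and 32-byte (AVX) replication of every row only exists so
 * that a single aligned load supplies the weights for a whole vector of output
 * pixels, which suits byte-wise multiply-add instructions such as
 * _mm_maddubs_epi16 (used the same way by the ALF kernel later in this
 * archive).
 *
 * The helper below is a minimal scalar reference for the per-pixel operation
 * those SIMD kernels vectorize.  Its name and parameters are editorial
 * inventions, and the rounding offset (64) and the final clipping are
 * assumptions that merely match the 128-sum normalization of the tables.
 */
static pel_t intra_filter4_sketch(const pel_t  *ref,    /* 4 consecutive reference samples         */
                                  const int8_t *coef4,  /* one 4-tap row, e.g. tab_coeff_mode_9[i] */
                                  int           max_val)/* (1 << bit_depth) - 1                    */
{
    /* weighted average of four neighbouring reference samples */
    int v = ref[0] * coef4[0] + ref[1] * coef4[1] +
            ref[2] * coef4[2] + ref[3] * coef4[3];

    v = (v + 64) >> 7;            /* rows sum to 128 -> 7-bit normalization */
    if (v < 0)       v = 0;       /* clip to the valid pixel range          */
    if (v > max_val) v = max_val;
    return (pel_t)v;
}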
xavs2-1.3/source/common/vec/intrinsic.h000066400000000000000000000764331340660520300201520ustar00rootroot00000000000000/* * intrinsic.h * * Description of this file: * SIMD assembly functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_INTRINSIC_H #define XAVS2_INTRINSIC_H /* --------------------------------------------------------------------------- * macros used for quick access of __m128i */ #define M128_U64(mx, idx) *(((uint64_t *)&mx) + idx) #define M128_U32(mx, idx) *(((uint32_t *)&mx) + idx) #define M128_I32(mx, idx) *((( int32_t *)&mx) + idx) #define M128_U16(mx, idx) *(((uint16_t *)&mx) + idx) #define M128_I16(mx, idx) *((( int16_t *)&mx) + idx) #if _MSC_VER // these AVX2 extract/insert helpers are not defined in MSVC's immintrin.h #define _mm256_extract_epi64(a, i) (a.m256i_i64[i]) #define _mm256_extract_epi32(a, i) (a.m256i_i32[i]) #define _mm256_extract_epi16(a, i) (a.m256i_i16[i]) #define _mm256_extract_epi8(a, i) (a.m256i_i8 [i]) #define _mm256_insert_epi64(a, v, i) (a.m256i_i64[i] = v) #define _mm_extract_epi64(r, i) r.m128i_i64[i] // insert an integer element into dst #define _mm256_insert_epi64(a, value, index) (a.m256i_i64[index] = value) #define _mm256_insert_epi32(a, value, index) (a.m256i_i32[index] = value) #define _mm256_insert_epi16(a, value, index) (a.m256i_i16[index] = value) #define _mm256_insert_epi8 (a, value, index) (a.m256i_i8 [index] = value) #else // AVX helper macros that older gcc versions lack #define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) #define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \ /* __m128i const* */ loaddr) \ _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)) #define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, \ /* __m256i */ a) \ do { \ __m256i _a = (a); /* reference a only once in macro body */ \ _mm_storeu_si128((loaddr), _mm256_castsi256_si128(_a)); \ _mm_storeu_si128((hiaddr), _mm256_extractf128_si256(_a, 0x1)); \ } while (0) #endif /* --------------------------------------------------------------------------- * global variables */ ALIGN32(extern const int8_t intrinsic_mask[15][16]); ALIGN32(extern const int8_t intrinsic_mask_256_8bit[16][32]); ALIGN32(extern const int8_t intrinsic_mask32[32][32]); ALIGN32(extern const int16_t intrinsic_mask_10bit[15][16]); ALIGN32(extern const int8_t tab_log2[65]); ALIGN16(extern const pel_t 
tab_coeff_mode_7[64][16]); ALIGN32(extern const uint8_t tab_idx_mode_7[64]); ALIGN32(extern const pel_t tab_coeff_mode_7_avx[64][32]); ALIGN16(extern const int8_t tab_coeff_mode_9[64][16]); extern const uint8_t tab_idx_mode_9[64]; ALIGN16(extern const int8_t tab_coeff_mode_11[64][16]); /* --------------------------------------------------------------------------- * functions */ #define intpl_copy_block_sse128 FPFX(intpl_copy_block_sse128) void intpl_copy_block_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height); #define intpl_luma_block_hor_sse128 FPFX(intpl_luma_block_hor_sse128) void intpl_luma_block_hor_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_sse128 FPFX(intpl_luma_block_ver_sse128) void intpl_luma_block_ver_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ext_sse128 FPFX(intpl_luma_block_ext_sse128) void intpl_luma_block_ext_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_luma_hor_sse128 FPFX(intpl_luma_hor_sse128) void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_ver_sse128 FPFX(intpl_luma_ver_sse128) void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_ext_sse128 FPFX(intpl_luma_ext_sse128) void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); #define intpl_luma_hor_avx2 FPFX(intpl_luma_hor_avx2) void intpl_luma_hor_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff); #define intpl_luma_ver_avx2 FPFX(intpl_luma_ver_avx2) void intpl_luma_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff); #define intpl_luma_ext_avx2 FPFX(intpl_luma_ext_avx2) void intpl_luma_ext_avx2(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); #define intpl_luma_hor_x3_sse128 FPFX(intpl_luma_hor_x3_sse128) void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ver_x3_sse128 FPFX(intpl_luma_ver_x3_sse128) void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ext_x3_sse128 FPFX(intpl_luma_ext_x3_sse128) void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); #define intpl_chroma_block_hor_sse128 FPFX(intpl_chroma_block_hor_sse128) void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ver_sse128 FPFX(intpl_chroma_block_ver_sse128) void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ext_sse128 FPFX(intpl_chroma_block_ext_sse128) void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_luma_block_hor_avx2 FPFX(intpl_luma_block_hor_avx2) void 
intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_avx2 FPFX(intpl_luma_block_ver_avx2) void intpl_luma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ext_avx2 FPFX(intpl_luma_block_ext_avx2) void intpl_luma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_chroma_block_hor_avx2 FPFX(intpl_chroma_block_hor_avx2) void intpl_chroma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ver_avx2 FPFX(intpl_chroma_block_ver_avx2) void intpl_chroma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ext_avx2 FPFX(intpl_chroma_block_ext_avx2) void intpl_chroma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_luma_hor_x3_avx2 FPFX(intpl_luma_hor_x3_avx2) void intpl_luma_hor_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ver_x3_avx2 FPFX(intpl_luma_ver_x3_avx2) void intpl_luma_ver_x3_avx2(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ext_x3_avx2 FPFX(intpl_luma_ext_x3_avx2) void intpl_luma_ext_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); /* memory operation */ #define cpy_pel_I420_to_uchar_YUY2_sse128 FPFX(cpy_pel_I420_to_uchar_YUY2_sse128) void cpy_pel_I420_to_uchar_YUY2_sse128(const pel_t *srcy, const pel_t *srcu, const pel_t *srcv, int i_src, int i_srcc, unsigned char *dst, int i_dst, int width, int height, int bit_size); #define add_pel_clip_sse128 FPFX(add_pel_clip_sse128) void add_pel_clip_sse128(const pel_t *src1, int i_src1, const int16_t *src2, int i_src2, pel_t *dst, int i_dst, int width, int height, int bit_depth); #define xavs2_pixel_average_sse128 FPFX(pixel_average_sse128) void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); #define xavs2_pixel_average_avx FPFX(pixel_average_avx) void xavs2_pixel_average_avx (pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height); #define padding_rows_sse128 FPFX(padding_rows_sse128) void padding_rows_sse128 (pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_lr_sse128 FPFX(padding_rows_lr_sse128) void padding_rows_lr_sse128(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_sse256 FPFX(padding_rows_sse256) void padding_rows_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_sse256_10bit FPFX(padding_rows_sse256_10bit) void padding_rows_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_lr_sse256 FPFX(padding_rows_lr_sse256_10bit) void padding_rows_lr_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_lr_sse256_10bit FPFX(padding_rows_lr_sse256) void padding_rows_lr_sse256_10bit(pel_t *src, int i_src, int width, 
int height, int start, int rows, int pad); #define xavs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2) void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n); #define xavs2_memzero_aligned_c_avx FPFX(memzero_aligned_c_avx) void *xavs2_memzero_aligned_c_avx (void *dst, size_t n); #define xavs2_mem_repeat_i_c_sse2 FPFX(mem_repeat_i_c_sse2) void xavs2_mem_repeat_i_c_sse2 (void *dst, int val, size_t count); #define xavs2_mem_repeat_i_c_avx FPFX(mem_repeat_i_c_avx) void xavs2_mem_repeat_i_c_avx (void *dst, int val, size_t count); #define xavs2_memcpy_aligned_c_sse2 FPFX(memcpy_aligned_c_sse2) void *xavs2_memcpy_aligned_c_sse2 (void *dst, const void *src, size_t n); #define deblock_edge_ver_sse128 FPFX(deblock_edge_ver_sse128) void deblock_edge_ver_sse128 (pel_t *SrcPtr, int stride, int Alpha, int Beta, unsigned char *flt_flag); #define deblock_edge_hor_sse128 FPFX(deblock_edge_hor_sse128) void deblock_edge_hor_sse128 (pel_t *SrcPtr, int stride, int Alpha, int Beta, unsigned char *flt_flag); #define deblock_edge_ver_c_sse128 FPFX(deblock_edge_ver_c_sse128) void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, unsigned char *flt_flag); #define deblock_edge_hor_c_sse128 FPFX(deblock_edge_hor_c_sse128) void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, unsigned char *flt_flag); //--------avx2-------- add by zhangjiaqi 2016-12-02 #define deblock_edge_hor_avx2 FPFX(deblock_edge_hor_avx2) void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_avx2 FPFX(deblock_edge_ver_avx2) void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_hor_c_avx2 FPFX(deblock_edge_hor_c_avx2) void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_c_avx2 FPFX(deblock_edge_ver_c_avx2) void deblock_edge_ver_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define dct_c_4x4_sse128 FPFX(dct_c_4x4_sse128) void dct_c_4x4_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_8x8_sse128 FPFX(dct_c_8x8_sse128) void dct_c_8x8_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x16_sse128 FPFX(dct_c_16x16_sse128) void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x32_sse128 FPFX(dct_c_32x32_sse128) void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_sse128 FPFX(dct_c_64x64_sse128) void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_4x16_sse128 FPFX(dct_c_4x16_sse128) void dct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_8x32_sse128 FPFX(dct_c_8x32_sse128) void dct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x4_sse128 FPFX(dct_c_16x4_sse128) void dct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x8_sse128 FPFX(dct_c_32x8_sse128) void dct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x16_sse128 FPFX(dct_c_64x16_sse128) void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x64_sse128 FPFX(dct_c_16x64_sse128) void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src); //futl #define dct_c_4x4_avx2 FPFX(dct_c_4x4_avx2) void dct_c_4x4_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define 
dct_c_8x8_avx2 FPFX(dct_c_8x8_avx2) void dct_c_8x8_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_4x16_avx2 FPFX(dct_c_4x16_avx2) void dct_c_4x16_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x4_avx2 FPFX(dct_c_16x4_avx2) void dct_c_16x4_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x16_avx2 FPFX(dct_c_16x16_avx2) void dct_c_16x16_avx2(const coeff_t * src, coeff_t * dst, int i_src); #define dct_c_8x32_avx2 FPFX(dct_c_8x32_avx2) void dct_c_8x32_avx2(const coeff_t *src, coeff_t *dst, int i_src); //avx2 function -zhangjiaqi #define dct_c_32x32_avx2 FPFX(dct_c_32x32_avx2) void dct_c_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x8_avx2 FPFX(dct_c_32x8_avx2) void dct_c_32x8_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_avx2 FPFX(dct_c_64x64_avx2) void dct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x16_avx2 FPFX(dct_c_64x16_avx2) void dct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_16x64_avx2 FPFX(dct_c_16x64_avx2) void dct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_src); /* half DCT, only keep low frequency coefficients */ #define dct_c_32x32_half_sse128 FPFX(dct_c_32x32_half_sse128) void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_half_sse128 FPFX(dct_c_64x64_half_sse128) void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_32x32_half_avx2 FPFX(dct_c_32x32_half_avx2) void dct_c_32x32_half_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define dct_c_64x64_half_avx2 FPFX(dct_c_64x64_half_avx2) void dct_c_64x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_src); #define transform_4x4_2nd_sse128 FPFX(transform_4x4_2nd_sse128) void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff); #define transform_2nd_sse128 FPFX(transform_2nd_sse128) void transform_2nd_sse128 (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); #define idct_c_4x4_sse128 FPFX(idct_c_4x4_sse128) void idct_c_4x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_8x8_sse128 FPFX(idct_c_8x8_sse128) void idct_c_8x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x16_sse128 FPFX(idct_c_16x16_sse128) void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_32x32_sse128 FPFX(idct_c_32x32_sse128) void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x64_sse128 FPFX(idct_c_64x64_sse128) void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x4_sse128 FPFX(idct_c_16x4_sse128) void idct_c_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_32x8_sse128 FPFX(idct_c_32x8_sse128) void idct_c_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x16_sse128 FPFX(idct_c_64x16_sse128) void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_4x16_sse128 FPFX(idct_c_4x16_sse128) void idct_c_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_8x32_sse128 FPFX(idct_c_8x32_sse128) void idct_c_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x64_sse128 FPFX(idct_c_16x64_sse128) void idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define inv_transform_4x4_2nd_sse128 FPFX(inv_transform_4x4_2nd_sse128) void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff); #define inv_transform_2nd_sse128 
FPFX(inv_transform_2nd_sse128) void inv_transform_2nd_sse128 (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); //zhangjiaqi add 2016.11.30 avx2 #define idct_c_8x8_avx2 FPFX(idct_c_8x8_avx2) void idct_c_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x16_avx2 FPFX(idct_c_16x16_avx2) void idct_c_16x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_32x32_avx2 FPFX(idct_c_32x32_avx2) void idct_c_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x64_avx2 FPFX(idct_c_64x64_avx2) void idct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_64x16_avx2 FPFX(idct_c_64x16_avx2) void idct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_c_16x64_avx2 FPFX(idct_c_16x64_avx2) void idct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); // scan the cg coefficient #define coeff_scan_4x4_xy_sse128 FPFX(coeff_scan_4x4_xy_sse128) void coeff_scan_4x4_xy_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift); #define coeff_scan_4x4_yx_sse128 FPFX(coeff_scan_4x4_yx_sse128) void coeff_scan_4x4_yx_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift); #define coeff_scan4_xy_sse128 FPFX(coeff_scan4_xy_sse128) void coeff_scan4_xy_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define coeff_scan4_yx_sse128 FPFX(coeff_scan4_yx_sse128) void coeff_scan4_yx_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define coeff_scan4_xy_avx FPFX(coeff_scan4_xy_avx) void coeff_scan4_xy_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define coeff_scan4_yx_avx FPFX(coeff_scan4_yx_avx) void coeff_scan4_yx_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4); #define abs_coeff_sse128 FPFX(abs_coeff_sse128) void abs_coeff_sse128(coeff_t *dst, const coeff_t *src, const int i_coef); #define add_sign_sse128 FPFX(add_sign_sse128) int add_sign_sse128(coeff_t *dst, const coeff_t *abs_val, const int i_coef); #define quant_c_avx2 FPFX(quant_c_avx2) int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); #define dequant_c_avx2 FPFX(dequant_c_avx2) void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift); #define quant_c_sse128 FPFX(quant_c_avx2) int quant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); #define dequant_c_sse128 FPFX(dequant_c_sse128) void dequant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); #define abs_coeff_avx2 FPFX(abs_coeff_avx2) void abs_coeff_avx2(coeff_t *dst, const coeff_t *src, const int i_coef); #define add_sign_avx2 FPFX(add_sign_avx2) int add_sign_avx2(coeff_t *dst, const coeff_t *abs_val, const int i_coef); #define SAO_on_block_sse128 FPFX(SAO_on_block_sse128) void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param); #define SAO_on_block_sse256 FPFX(SAO_on_block_sse256) void SAO_on_block_sse256(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src,int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param); #define alf_flt_one_block_sse128 FPFX(alf_flt_one_block_sse128) void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail); #define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128) void 
intra_pred_dc_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128) void intra_pred_plane_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128) void intra_pred_bilinear_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128) void intra_pred_hor_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128) void intra_pred_ver_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_3_sse128 FPFX(intra_pred_ang_x_3_sse128) void intra_pred_ang_x_3_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_4_sse128 FPFX(intra_pred_ang_x_4_sse128) void intra_pred_ang_x_4_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_5_sse128 FPFX(intra_pred_ang_x_5_sse128) void intra_pred_ang_x_5_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_6_sse128 FPFX(intra_pred_ang_x_6_sse128) void intra_pred_ang_x_6_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_7_sse128 FPFX(intra_pred_ang_x_7_sse128) void intra_pred_ang_x_7_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_8_sse128 FPFX(intra_pred_ang_x_8_sse128) void intra_pred_ang_x_8_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_9_sse128 FPFX(intra_pred_ang_x_9_sse128) void intra_pred_ang_x_9_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_10_sse128 FPFX(intra_pred_ang_x_10_sse128) void intra_pred_ang_x_10_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_11_sse128 FPFX(intra_pred_ang_x_11_sse128) void intra_pred_ang_x_11_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_25_sse128 FPFX(intra_pred_ang_y_25_sse128) void intra_pred_ang_y_25_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_26_sse128 FPFX(intra_pred_ang_y_26_sse128) void intra_pred_ang_y_26_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_28_sse128 FPFX(intra_pred_ang_y_28_sse128) void intra_pred_ang_y_28_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_30_sse128 FPFX(intra_pred_ang_y_30_sse128) void intra_pred_ang_y_30_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_31_sse128 FPFX(intra_pred_ang_y_31_sse128) void intra_pred_ang_y_31_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_sse128 FPFX(intra_pred_ang_y_32_sse128) void intra_pred_ang_y_32_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_13_sse128 FPFX(intra_pred_ang_xy_13_sse128) void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_14_sse128 FPFX(intra_pred_ang_xy_14_sse128) void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int 
dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_16_sse128 FPFX(intra_pred_ang_xy_16_sse128) void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_18_sse128 FPFX(intra_pred_ang_xy_18_sse128) void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_20_sse128 FPFX(intra_pred_ang_xy_20_sse128) void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_22_sse128 FPFX(intra_pred_ang_xy_22_sse128) void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_23_sse128 FPFX(intra_pred_ang_xy_23_sse128) void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define fill_edge_samples_0_sse128 FPFX(fill_edge_samples_0_sse128) void fill_edge_samples_0_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_x_sse128 FPFX(fill_edge_samples_x_sse128) void fill_edge_samples_x_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_y_sse128 FPFX(fill_edge_samples_y_sse128) void fill_edge_samples_y_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_xy_sse128 FPFX(fill_edge_samples_xy_sse128) void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); //intra prediction avx functions #define intra_pred_ver_avx FPFX(intra_pred_ver_avx) void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_avx FPFX(intra_pred_hor_avx) void intra_pred_hor_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_dc_avx FPFX(intra_pred_dc_avx) void intra_pred_dc_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_plane_avx FPFX(intra_pred_plane_avx) void intra_pred_plane_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_bilinear_avx FPFX(intra_pred_bilinear_avx) void intra_pred_bilinear_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_3_avx FPFX(intra_pred_ang_x_3_avx) void intra_pred_ang_x_3_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_4_avx FPFX(intra_pred_ang_x_4_avx) void intra_pred_ang_x_4_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_5_avx FPFX(intra_pred_ang_x_5_avx) void intra_pred_ang_x_5_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_6_avx FPFX(intra_pred_ang_x_6_avx) void intra_pred_ang_x_6_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_7_avx FPFX(intra_pred_ang_x_7_avx) void intra_pred_ang_x_7_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_8_avx FPFX(intra_pred_ang_x_8_avx) void intra_pred_ang_x_8_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_9_avx FPFX(intra_pred_ang_x_9_avx) void intra_pred_ang_x_9_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define 
intra_pred_ang_x_10_avx FPFX(intra_pred_ang_x_10_avx) void intra_pred_ang_x_10_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_11_avx FPFX(intra_pred_ang_x_11_avx) void intra_pred_ang_x_11_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_13_avx FPFX(intra_pred_ang_xy_13_avx) void intra_pred_ang_xy_13_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_14_avx FPFX(intra_pred_ang_xy_14_avx) void intra_pred_ang_xy_14_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_16_avx FPFX(intra_pred_ang_xy_16_avx) void intra_pred_ang_xy_16_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_18_avx FPFX(intra_pred_ang_xy_18_avx) void intra_pred_ang_xy_18_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_20_avx FPFX(intra_pred_ang_xy_20_avx) void intra_pred_ang_xy_20_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_22_avx FPFX(intra_pred_ang_xy_22_avx) void intra_pred_ang_xy_22_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_23_avx FPFX(intra_pred_ang_xy_23_avx) void intra_pred_ang_xy_23_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_25_avx FPFX(intra_pred_ang_y_25_avx) void intra_pred_ang_y_25_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_26_avx FPFX(intra_pred_ang_y_26_avx) void intra_pred_ang_y_26_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_28_avx FPFX(intra_pred_ang_y_28_avx) void intra_pred_ang_y_28_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_30_avx FPFX(intra_pred_ang_y_30_avx) void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_31_avx FPFX(intra_pred_ang_y_31_avx) void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_avx FPFX(intra_pred_ang_y_32_avx) void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define mad_16x16_sse128 FPFX(mad_16x16_sse128) int mad_16x16_sse128(pel_t *p_src, int i_src, int cu_size); #define mad_32x32_sse128 FPFX(mad_32x32_sse128) int mad_32x32_sse128(pel_t *p_src, int i_src, int cu_size); #define mad_64x64_sse128 FPFX(mad_64x64_sse128) int mad_64x64_sse128(pel_t *p_src, int i_src, int cu_size); #endif // #ifndef XAVS2_INTRINSIC_H xavs2-1.3/source/common/vec/intrinsic_alf.c000066400000000000000000000206211340660520300207550ustar00rootroot00000000000000/* * intrinsic_alf.c * * Description of this file: * SSE assembly functions of ALF module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../common.h" #include "intrinsic.h" #include #include #include #include void alf_flt_one_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; __m128i T00, T01, T10, T11, T20, T21, T30, T31, T40, T41, T50, T51; __m128i T1, T2, T3, T4, T5, T6, T7, T8; __m128i E00, E01, E10, E11, E20, E21, E30, E31, E40, E41; __m128i C0, C1, C2, C3, C4, C30, C31, C32, C33; __m128i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S60, S61, S7, S8, SS1, SS2, S; __m128i mSwitch1, mSwitch2, mSwitch3, mSwitch4, mSwitch5; __m128i mAddOffset; __m128i mZero = _mm_set1_epi16(0); __m128i mMax = _mm_set1_epi16((short)(max_pel_value)); __m128i mask; int i, j; int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); int xPosEnd = lcu_pix_x + lcu_width; int xPosEnd16 = xPosEnd - (lcu_width & 0x0f); int yUp, yBottom; mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(lcu_width & 15) - 1])); p_src += (startPos * i_src); p_dst += (startPos * i_dst); C0 = _mm_set1_epi8((char)alf_coeff[0]); C1 = _mm_set1_epi8((char)alf_coeff[1]); C2 = _mm_set1_epi8((char)alf_coeff[2]); C3 = _mm_set1_epi8((char)alf_coeff[3]); C4 = _mm_set1_epi8((char)alf_coeff[4]); mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 2, 1, 0, 3, 0, 1, 2, 3, 2, 1, 0, 3); C30 = _mm_loadu_si128((__m128i*)&alf_coeff[5]); C31 = _mm_packs_epi32(C30, C30); C32 = _mm_packs_epi16(C31, C31); C33 = _mm_shuffle_epi8(C32, mSwitch1); mSwitch2 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, -1, 1, 2, 3, 4, 5, 6, 7, -1); mSwitch3 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, -1, 3, 4, 5, 6, 7, 8, 9, -1); mSwitch4 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, -1, 5, 6, 7, 8, 9, 10, 11, -1); mSwitch5 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, -1, 7, 8, 9, 10, 11, 12, 13, -1); mAddOffset = _mm_set1_epi16(32); for (i = startPos; i < endPos; i++) { yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 1); yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 1); p_src1 = p_src + (yBottom - i) * i_src; p_src2 = p_src + (yUp - i) * i_src; yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 2); yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 2); p_src3 = p_src + (yBottom - i) * i_src; p_src4 = p_src + (yUp - i) * i_src; yUp = XAVS2_CLIP3(startPos, endPos - 1, i - 3); yBottom = XAVS2_CLIP3(startPos, endPos - 1, i + 3); p_src5 = p_src + (yBottom - i) * i_src; p_src6 = p_src + (yUp - i) * i_src; for (j = lcu_pix_x; j < xPosEnd; j += 16) { T00 = _mm_loadu_si128((__m128i*)&p_src6[j]); T01 = _mm_loadu_si128((__m128i*)&p_src5[j]); E00 = _mm_unpacklo_epi8(T00, T01); E01 = _mm_unpackhi_epi8(T00, T01); S00 = _mm_maddubs_epi16(E00, C0);//ǰ8C0*P0Ľ S01 = _mm_maddubs_epi16(E01, C0);//8C0*P0Ľ T10 = _mm_loadu_si128((__m128i*)&p_src4[j]); T11 = _mm_loadu_si128((__m128i*)&p_src3[j]); E10 = _mm_unpacklo_epi8(T10, T11); E11 = _mm_unpackhi_epi8(T10, T11); S10 = _mm_maddubs_epi16(E10, C1);//ǰ8C1*P1Ľ S11 = _mm_maddubs_epi16(E11, C1);//8C1*P1Ľ T20 = _mm_loadu_si128((__m128i*)&p_src2[j - 1]); T21 = _mm_loadu_si128((__m128i*)&p_src1[j + 1]); E20 = _mm_unpacklo_epi8(T20, T21); E21 = _mm_unpackhi_epi8(T20, T21); S20 = _mm_maddubs_epi16(E20, C2); S21 = _mm_maddubs_epi16(E21, C2); T30 = _mm_loadu_si128((__m128i*)&p_src2[j]); T31 = _mm_loadu_si128((__m128i*)&p_src1[j]); E30 = _mm_unpacklo_epi8(T30, T31); E31 = _mm_unpackhi_epi8(T30, T31); S30 = _mm_maddubs_epi16(E30, C3); S31 = _mm_maddubs_epi16(E31, C3); T40 = _mm_loadu_si128((__m128i*)&p_src2[j + 1]); T41 = _mm_loadu_si128((__m128i*)&p_src1[j - 1]); E40 = _mm_unpacklo_epi8(T40, T41); E41 = _mm_unpackhi_epi8(T40, T41); S40 = _mm_maddubs_epi16(E40, C4); S41 = _mm_maddubs_epi16(E41, C4); T50 = _mm_loadu_si128((__m128i*)&p_src[j - 3]); T51 = _mm_loadu_si128((__m128i*)&p_src[j + 5]); T1 = _mm_shuffle_epi8(T50, mSwitch2); T2 = _mm_shuffle_epi8(T50, mSwitch3); T3 = _mm_shuffle_epi8(T50, mSwitch4); T4 = _mm_shuffle_epi8(T50, mSwitch5); T5 = _mm_shuffle_epi8(T51, mSwitch2); T6 = _mm_shuffle_epi8(T51, mSwitch3); T7 = _mm_shuffle_epi8(T51, mSwitch4); T8 = _mm_shuffle_epi8(T51, mSwitch5); S5 = _mm_maddubs_epi16(T1, C33); S6 = _mm_maddubs_epi16(T2, C33); S7 = _mm_maddubs_epi16(T3, C33); S8 = _mm_maddubs_epi16(T4, C33); S50 = _mm_hadds_epi16(S5, S6); S51 = _mm_hadds_epi16(S7, S8); S5 = _mm_hadds_epi16(S50, S51);//ǰ8 S4 = _mm_maddubs_epi16(T5, C33); S6 = 
_mm_maddubs_epi16(T6, C33); S7 = _mm_maddubs_epi16(T7, C33); S8 = _mm_maddubs_epi16(T8, C33); S60 = _mm_hadds_epi16(S4, S6); S61 = _mm_hadds_epi16(S7, S8); S6 = _mm_hadds_epi16(S60, S61);//8 S0 = _mm_adds_epi16(S00, S10); S1 = _mm_adds_epi16(S30, S20); S2 = _mm_adds_epi16(S40, S5); S3 = _mm_adds_epi16(S1, S0); SS1 = _mm_adds_epi16(S2, S3);//ǰ8 S0 = _mm_adds_epi16(S01, S11); S1 = _mm_adds_epi16(S31, S21); S2 = _mm_adds_epi16(S41, S6); S3 = _mm_adds_epi16(S1, S0); SS2 = _mm_adds_epi16(S2, S3);//8 SS1 = _mm_adds_epi16(SS1, mAddOffset); SS1 = _mm_srai_epi16(SS1, 6); SS1 = _mm_min_epi16(SS1, mMax); SS1 = _mm_max_epi16(SS1, mZero); SS2 = _mm_adds_epi16(SS2, mAddOffset); SS2 = _mm_srai_epi16(SS2, 6); SS2 = _mm_min_epi16(SS2, mMax); SS2 = _mm_max_epi16(SS2, mZero); S = _mm_packus_epi16(SS1, SS2); if (j != xPosEnd16){ _mm_storeu_si128((__m128i*)(p_dst + j), S); } else{ _mm_maskmoveu_si128(S, mask, (char *)(p_dst + j)); break; } } p_src += i_src; p_dst += i_dst; } } xavs2-1.3/source/common/vec/intrinsic_cg_scan.c000066400000000000000000000166261340660520300216220ustar00rootroot00000000000000/* * intrinsic_cg_scan.c * * Description of this file: * SSE assembly functions of CG-Scanning module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Jiaqi ZHANG * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../common.h" #include "intrinsic.h" #include #include #include #include /* --------------------------------------------------------------------------- */ void coeff_scan_4x4_xy_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift) { __m128i row0, row1, row2, row3; __m128i dst1, dst2; __m128i order1, order2; int int1, int2; order1 = _mm_setr_epi16(0x0100, 0x0302, 0x0908, 0x0F0E, 0x0B0A, 0x0504, 0x0706, 0x0D0C); order2 = _mm_setr_epi16(0x0302, 0x0908, 0x0B0A, 0x0504, 0x0100, 0x0706, 0x0D0C, 0x0F0E); row0 = _mm_loadl_epi64((const __m128i*)&src[0 << i_src_shift]); row1 = _mm_loadl_epi64((const __m128i*)&src[(int64_t)(1 << i_src_shift)]); row2 = _mm_loadl_epi64((const __m128i*)&src[2 << i_src_shift]); row3 = _mm_loadl_epi64((const __m128i*)&src[3 << i_src_shift]); dst1 = _mm_unpacklo_epi64(row0, row1); //0 1 2 3 4 5 6 7 dst2 = _mm_unpacklo_epi64(row2, row3); //8 9 10 11 12 13 14 15 int1 = _mm_extract_epi16(dst1, 7); int2 = _mm_extract_epi16(dst2, 0); dst1 = _mm_insert_epi16(dst1, int2, 7); //0 1 2 3 4 5 6 8 dst2 = _mm_insert_epi16(dst2, int1, 0); //7 9 10 11 12 13 14 15 //0 1 2 3 4 5 6 8 -------> 0 1 4 8 5 2 3 6 dst1 = _mm_shuffle_epi8(dst1, order1); //0 1 2 3 4 5 6 7 //7 9 10 11 12 13 14 15 --------> 9 12 13 10 7 11 14 15 dst2 = _mm_shuffle_epi8(dst2, order2); _mm_store_si128((__m128i*)(dst + 0), dst1); _mm_store_si128((__m128i*)(dst + 8), dst2); } /* --------------------------------------------------------------------------- */ void coeff_scan_4x4_yx_sse128(coeff_t *dst, const coeff_t *src, int i_src_shift) { __m128i row0, row1, row2, row3; __m128i dst1, dst2; __m128i order1, order2; int int1, int2; order1 = _mm_setr_epi16(0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E); order2 = _mm_setr_epi16(0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E); row0 = _mm_loadl_epi64((const __m128i*)&src[0 << i_src_shift]); // 0 1 2 3 row1 = _mm_loadl_epi64((const __m128i*)&src[(int64_t)1 << i_src_shift]); // 4 5 6 7 row2 = _mm_loadl_epi64((const __m128i*)&src[2 << i_src_shift]); // 8 9 10 11 row3 = _mm_loadl_epi64((const __m128i*)&src[3 << i_src_shift]); //12 13 14 15 dst1 = _mm_unpacklo_epi64(row0, row1); //0 1 2 3 4 5 6 7 dst2 = _mm_unpacklo_epi64(row2, row3); //8 9 10 11 12 13 14 15 int1 = _mm_extract_epi32(dst1, 3); int2 = _mm_extract_epi32(dst2, 0); dst1 = _mm_insert_epi32(dst1, int2, 3); //0 1 2 3 4 5 8 9 dst2 = _mm_insert_epi32(dst2, int1, 0); //6 7 10 11 12 13 14 15 int1 = _mm_extract_epi16(dst1, 3); int2 = _mm_extract_epi16(dst2, 4); dst1 = _mm_insert_epi16(dst1, int2, 3); //0 1 2 12 4 5 8 9 dst2 = _mm_insert_epi16(dst2, int1, 4); //6 7 10 11 3 13 14 15 //0 1 2 3 4 5 6 7 //0 1 2 12 4 5 8 9 -------> 0 4 1 2 5 8 12 9 dst1 = _mm_shuffle_epi8(dst1, order1); //0 1 2 3 4 5 6 7 //6 7 10 11 3 13 14 15 --------> 6 3 7 10 13 14 11 15 dst2 = _mm_shuffle_epi8(dst2, order2); _mm_store_si128((__m128i*)(dst + 0), dst1); _mm_store_si128((__m128i*)(dst + 8), dst2); } #if ARCH_X86_64 /* --------------------------------------------------------------------------- */ void coeff_scan4_xy_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { __m128i dst1, dst2; __m128i order1, order2; int int1, int2; order1 = _mm_setr_epi16(0x0100, 0x0302, 0x0908, 0x0F0E, 0x0B0A, 0x0504, 0x0706, 0x0D0C); order2 = _mm_setr_epi16(0x0302, 0x0908, 0x0B0A, 0x0504, 0x0100, 0x0706, 0x0D0C, 0x0F0E); dst1 = _mm_set_epi64x((int64_t)r2, (int64_t)r1); //0 1 2 3 4 5 6 7 dst2 = _mm_set_epi64x((int64_t)r4, (int64_t)r3); //8 9 10 11 12 13 14 15 int1 = _mm_extract_epi16(dst1, 7); int2 = 
_mm_extract_epi16(dst2, 0); dst1 = _mm_insert_epi16(dst1, int2, 7); //0 1 2 3 4 5 6 8 dst2 = _mm_insert_epi16(dst2, int1, 0); //7 9 10 11 12 13 14 15 //0 1 2 3 4 5 6 8 -------> 0 1 4 8 5 2 3 6 dst1 = _mm_shuffle_epi8(dst1, order1); //0 1 2 3 4 5 6 7 //7 9 10 11 12 13 14 15 --------> 9 12 13 10 7 11 14 15 dst2 = _mm_shuffle_epi8(dst2, order2); _mm_store_si128((__m128i*)(dst + 0), dst1); _mm_store_si128((__m128i*)(dst + 8), dst2); } /* --------------------------------------------------------------------------- */ void coeff_scan4_yx_sse128(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { __m128i dst1, dst2; __m128i order1, order2; int int1, int2; order1 = _mm_setr_epi16(0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E); order2 = _mm_setr_epi16(0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E); dst1 = _mm_set_epi64x((int64_t)r2, (int64_t)r1); //0 1 2 3 4 5 6 7 dst2 = _mm_set_epi64x((int64_t)r4, (int64_t)r3); //8 9 10 11 12 13 14 15 int1 = _mm_extract_epi32(dst1, 3); int2 = _mm_extract_epi32(dst2, 0); dst1 = _mm_insert_epi32(dst1, int2, 3); //0 1 2 3 4 5 8 9 dst2 = _mm_insert_epi32(dst2, int1, 0); //6 7 10 11 12 13 14 15 int1 = _mm_extract_epi16(dst1, 3); int2 = _mm_extract_epi16(dst2, 4); dst1 = _mm_insert_epi16(dst1, int2, 3); //0 1 2 12 4 5 8 9 dst2 = _mm_insert_epi16(dst2, int1, 4); //6 7 10 11 3 13 14 15 //0 1 2 3 4 5 6 7 //0 1 2 12 4 5 8 9 -------> 0 4 1 2 5 8 12 9 dst1 = _mm_shuffle_epi8(dst1, order1); //0 1 2 3 4 5 6 7 //6 7 10 11 3 13 14 15 --------> 6 3 7 10 13 14 11 15 dst2 = _mm_shuffle_epi8(dst2, order2); _mm_store_si128((__m128i*)(dst + 0), dst1); _mm_store_si128((__m128i*)(dst + 8), dst2); } #endif // ARCH_X86_64 xavs2-1.3/source/common/vec/intrinsic_cg_scan_avx.c000066400000000000000000000075321340660520300224740ustar00rootroot00000000000000/* * intrinsic_cg_scan_avx.c * * Description of this file: * AVX2 assembly functions of CG-Scanning module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Jiaqi ZHANG * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include #include #include #include #include #include "../common.h" #include "intrinsic.h" #if ARCH_X86_64 /* --------------------------------------------------------------------------- */ void coeff_scan4_xy_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { __m256i order1; __m256i m_in; int16_t int1, int2; order1 = _mm256_setr_epi16(0x0100, 0x0302, 0x0908, 0x0F0E, 0x0B0A, 0x0504, 0x0706, 0x0D0C, 0x0302, 0x0908, 0x0B0A, 0x0504, 0x0100, 0x0706, 0x0D0C, 0x0F0E); m_in = _mm256_setr_epi64x(r1, r2, r3, r4); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 int1 = _mm256_extract_epi16(m_in, 7); int2 = _mm256_extract_epi16(m_in, 8); _mm256_insert_epi16(m_in, int2, 7); //0 1 2 3 4 5 6 8 _mm256_insert_epi16(m_in, int1, 8); //7 9 10 11 12 13 14 15 //0 1 2 3 4 5 6 8 --------> 0 1 4 8 5 2 3 6 //7 9 10 11 12 13 14 15 --------> 9 12 13 10 7 11 14 15 m_in = _mm256_shuffle_epi8(m_in, order1); _mm256_storeu_si256((__m256i*)dst, m_in); } /* --------------------------------------------------------------------------- */ void coeff_scan4_yx_avx(coeff_t *dst, uint64_t r1, uint64_t r2, uint64_t r3, uint64_t r4) { __m256i order1; __m256i m_in; int32_t int1, int2; order1 = _mm256_setr_epi16(0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E, 0x0100, 0x0908, 0x0302, 0x0504, 0x0B0A, 0x0D0C, 0x0706, 0x0F0E); m_in = _mm256_setr_epi64x(r1, r2, r3, r4); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 int1 = _mm256_extract_epi32(m_in, 3); int2 = _mm256_extract_epi32(m_in, 4); _mm256_insert_epi32(m_in, int2, 3); //0 1 2 3 4 5 8 9 _mm256_insert_epi32(m_in, int1, 4); //6 7 10 11 12 13 14 15 int1 = _mm256_extract_epi16(m_in, 3); int2 = _mm256_extract_epi16(m_in, 12); _mm256_insert_epi16(m_in, (int16_t)int2, 3); //0 1 2 12 4 5 8 9 _mm256_insert_epi16(m_in, (int16_t)int1, 12); //6 7 10 11 3 13 14 15 //0 1 2 12 4 5 8 9 --------> 0 4 1 2 5 8 12 9 //6 7 10 11 3 13 14 15 --------> 6 3 7 10 13 14 11 15 m_in= _mm256_shuffle_epi8(m_in, order1); _mm256_storeu_si256((__m256i*)dst, m_in); } #endif // ARCH_X86_64 xavs2-1.3/source/common/vec/intrinsic_dct.c000066400000000000000000010411561340660520300207740ustar00rootroot00000000000000/* * intrinsic_dct.c * * Description of this file: * SSE assembly functions of DCT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include #include #include #include #include #include "../basic_types.h" #include "../avs2_defs.h" #include "intrinsic.h" void *xavs2_fast_memzero_mmx(void *dst, size_t n); #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) /* --------------------------------------------------------------------------- * functions defined in this file: * dct16, dct32 */ ALIGN32(static const int16_t tab_dct_4[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 42, 17, 42, 17, 42, 17, 42, 17 }, { 32, -32, 32, -32, 32, -32, 32, -32 }, { 17, -42, 17, -42, 17, -42, 17, -42 }, }; ALIGN32(static const int16_t tab_dct1_4[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 42, 17, -17, -42, 42, 17, -17, -42 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 17, -42, 42, -17, 17, -42, 42, -17 } }; ALIGN32(static const int16_t tab_dct_8[][8]) = { { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, { 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17 }, { 17, -42, 17, -42, 17, -42, 17, -42 }, { 44, 9, 38, 25, 44, 9, 38, 25 }, { 38, -25, -9, -44, 38, -25, -9, -44 }, { 25, 38, -44, 9, 25, 38, -44, 9 }, { 9, -44, -25, 38, 9, -44, -25, 38 }, { 42, 42, -42, -42, 17, 17, -17, -17 }, { 17, 17, -17, -17, -42, -42, 42, 42 }, { 44, -44, 9, -9, 38, -38, 25, -25 }, { 38, -38, -25, 25, -9, 9, -44, 44 }, { 25, -25, 38, -38, -44, 44, 9, -9 }, { 9, -9, -44, 44, -25, 25, 38, -38 } }; ALIGN32(static const int16_t tab_dct_8_1[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 44, 38, 25, 9, - 9, -25, -38, -44 }, { 42, 17, -17, -42, -42, -17, 17, 42 }, { 38, - 9, -44, -25, 25, 44, 9, -38 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 25, -44, 9, 38, -38, - 9, 44, -25 }, { 17, -42, 42, -17, -17, 42, -42, 17 }, { 9, -25, 38, -44, 44, -38, 25, - 9 } }; ALIGN32(static const int16_t tab_dct_16_0[][8]) = { { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, // 0 { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, // 1 { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A }, // 2 [0 3 1 2 7 4 6 5] { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 } // 3 [7 4 6 5 0 3 1 2] }; ALIGN32(static const int16_t tab_dct_16_1[][8]) = { { 45, 43, 40, 35, 29, 21, 13, 4 }, // 0 { 43, 29, 4, -21, -40, -45, -35, -13 }, // 1 { 40, 4, -35, -43, -13, 29, 45, 21 }, // 2 { 35, -21, -43, 4, 45, 13, -40, -29 }, // 3 { 29, -40, -13, 45, -4, -43, 21, 35 }, // 4 { 21, -45, 29, 13, -43, 35, 4, -40 }, // 5 { 13, -35, 45, -40, 21, 4, -29, 43 }, // 6 { 4, -13, 21, -29, 35, -40, 43, -45 }, // 7 { 42, 42, -42, -42, 17, 17, -17, -17 }, // 8 { 17, 17, -17, -17, -42, -42, 42, 42 }, // 9 { 44, 44, 9, 9, 38, 38, 25, 25 }, // 10 { 38, 38, -25, -25, -9, -9, -44, -44 }, // 11 { 25, 25, 38, 38, -44, -44, 9, 9 }, // 12 { 9, 9, -44, -44, -25, -25, 38, 38 }, // 13 #define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) }, \ { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) }, MAKE_COEF(45, 43, 40, 35, 29, 21, 13, 4) MAKE_COEF(43, 29, 4, -21, -40, -45, -35, -13) MAKE_COEF(40, 4, -35, -43, -13, 29, 45, 21) MAKE_COEF(35, -21, -43, 4, 45, 13, -40, -29) MAKE_COEF(29, -40, -13, 45, -4, -43, 21, 35) MAKE_COEF(21, -45, 29, 13, -43, 35, 4, -40) MAKE_COEF(13, -35, 45, -40, 21, 4, -29, 43) MAKE_COEF( 4, -13, 21, -29, 35, -40, 43, -45) #undef MAKE_COEF }; ALIGN32(static const int32_t tab_dct_16_new_coeff[][4]) = { { 32, 32, 32, 32 }, // 0 { 44, 9, 38, 25 }, // 2 { 42, 17, 42, 17 }, // 4 { 38, -25, -9, -44 }, // 6 { 32, 
32, 32, 32 }, // 8 { 25, 38, -44, 9 }, // 10 { 17, -42, 17, -42 }, // 12 { 9, -44, -25, 38 }, // 14 }; ALIGN32(static const int16_t tab_dct_32_0[][8]) = { { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 }, // 0 }; ALIGN32(static const int16_t tab_dct_32_1[][8]) = { { 44, -44, 9, -9, 38, -38, 25, -25 }, // 0 { 38, -38, -25, 25, -9, 9, -44, 44 }, // 1 { 25, -25, 38, -38, -44, 44, 9, -9 }, // 2 { 9, -9, -44, 44, -25, 25, 38, -38 }, // 3 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) }, MAKE_COEF8(45, 43, 40, 35, 29, 21, 13, 4) // 4 MAKE_COEF8(43, 29, 4, -21, -40, -45, -35, -13) // 5 MAKE_COEF8(40, 4, -35, -43, -13, 29, 45, 21) // 6 MAKE_COEF8(35, -21, -43, 4, 45, 13, -40, -29) // 7 MAKE_COEF8(29, -40, -13, 45, -4, -43, 21, 35) // 8 MAKE_COEF8(21, -45, 29, 13, -43, 35, 4, -40) // 9 MAKE_COEF8(13, -35, 45, -40, 21, 4, -29, 43) // 10 MAKE_COEF8( 4, -13, 21, -29, 35, -40, 43, -45) // 11 #undef MAKE_COEF8 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \ { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) }, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // 12 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) // 14 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) // 16 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) // 18 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) // 20 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) // 22 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) // 24 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) // 26 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) // 28 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) // 30 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) // 32 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) // 34 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) // 36 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) // 38 MAKE_COEF16( 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) // 40 MAKE_COEF16( 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) // 42 #undef MAKE_COEF16 { 32, 32, 32, 32, 32, 32, 32, 32 }, // 44 { 32, 32, -32, -32, -32, -32, 32, 32 }, // 45 { 42, 42, 17, 17, -17, -17, -42, -42 }, // 46 { -42, -42, -17, -17, 17, 17, 42, 42 }, // 47 { 17, 17, -42, -42, 42, 42, -17, -17 }, // 48 { -17, -17, 42, 42, -42, -42, 17, 17 }, // 49 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \ { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \ { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \ { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) }, MAKE_COEF16(44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44) // 50 MAKE_COEF16(38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38) // 54 // TODO: convert below table here #undef MAKE_COEF16 { 25, 25, -44, -44, 9, 9, 38, 38 }, // 58 { -38, -38, -9, -9, 44, 44, -25, -25 
}, // 59 { -25, -25, 44, 44, -9, -9, -38, -38 }, // 60 { 38, 38, 9, 9, -44, -44, 25, 25 }, // 61 { 9, 9, -25, -25, 38, 38, -44, -44 }, // 62 { 44, 44, -38, -38, 25, 25, -9, -9 }, // 63 { -9, -9, 25, 25, -38, -38, 44, 44 }, // 64 { -44, -44, 38, 38, -25, -25, 9, 9 }, // 65 { 45, 45, 43, 43, 40, 40, 35, 35 }, // 66 { 29, 29, 21, 21, 13, 13, 4, 4 }, // 67 { -4, -4, -13, -13, -21, -21, -29, -29 }, // 68 { -35, -35, -40, -40, -43, -43, -45, -45 }, // 69 { 43, 43, 29, 29, 4, 4, -21, -21 }, // 70 { -40, -40, -45, -45, -35, -35, -13, -13 }, // 71 { 13, 13, 35, 35, 45, 45, 40, 40 }, // 72 { 21, 21, -4, -4, -29, -29, -43, -43 }, // 73 { 40, 40, 4, 4, -35, -35, -43, -43 }, // 74 { -13, -13, 29, 29, 45, 45, 21, 21 }, // 75 { -21, -21, -45, -45, -29, -29, 13, 13 }, // 76 { 43, 43, 35, 35, -4, -4, -40, -40 }, // 77 { 35, 35, -21, -21, -43, -43, 4, 4 }, // 78 { 45, 45, 13, 13, -40, -40, -29, -29 }, // 79 { 29, 29, 40, 40, -13, -13, -45, -45 }, // 80 { -4, -4, 43, 43, 21, 21, -35, -35 }, // 81 { 29, 29, -40, -40, -13, -13, 45, 45 }, // 82 { -4, -4, -43, -43, 21, 21, 35, 35 }, // 83 { -35, -35, -21, -21, 43, 43, 4, 4 }, // 84 { -45, -45, 13, 13, 40, 40, -29, -29 }, // 85 { 21, 21, -45, -45, 29, 29, 13, 13 }, // 86 { -43, -43, 35, 35, 4, 4, -40, -40 }, // 87 { 40, 40, -4, -4, -35, -35, 43, 43 }, // 88 { -13, -13, -29, -29, 45, 45, -21, -21 }, // 89 { 13, 13, -35, -35, 45, 45, -40, -40 }, // 90 { 21, 21, 4, 4, -29, -29, 43, 43 }, // 91 { -43, -43, 29, 29, -4, -4, -21, -21 }, // 92 { 40, 40, -45, -45, 35, 35, -13, -13 }, // 93 { 4, 4, -13, -13, 21, 21, -29, -29 }, // 94 { 35, 35, -40, -40, 43, 43, -45, -45 }, // 95 { 45, 45, -43, -43, 40, 40, -35, -35 }, // 96 { 29, 29, -21, -21, 13, 13, -4, -4 }, // 97 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \ { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \ { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \ { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) }, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // 98 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) //102 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) //106 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) //110 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) //114 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) //118 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) //122 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) //126 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) //130 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) //134 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) //138 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) //142 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) //146 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) //150 MAKE_COEF16( 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) //154 MAKE_COEF16( 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) //158 #undef MAKE_COEF16 }; ALIGN32(static const int32_t 
tab_dct_32_zhangjiaqi[][4]) = { { 32, 32, 32, 32 }, // order:0 // 0 / 16 { 42, -42, 17, -17 }, // order:1 // 8 { 17, -17, -42, 42 }, // order:2 // 24 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), (a1), (a2), (a3) }, \ { (a4), (a5), (a6), (a7) }, MAKE_COEF8(44, -44, 9, -9, 38, -38, 25, -25) // order:3/4 // 4 MAKE_COEF8(38, -38, -25, 25, -9, 9, -44, 44) // order:5/6 // 12 MAKE_COEF8(25, -25, 38, -38, -44, 44, 9, -9) // order:7/8 // 20 MAKE_COEF8(9, -9, -44, 44, -25, 25, 38, -38) // order:9/10 // 28 #undef MAKE_COEF8 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ {(a0), (a7), (a3), (a4)}, \ { (a1), (a6), (a2), (a5) }, MAKE_COEF8(45, 43, 40, 35, 29, 21, 13, 4) // order:11/12 // 2 MAKE_COEF8(43, 29, 4, -21, -40, -45, -35, -13) // order:13/14 // 6 MAKE_COEF8(40, 4, -35, -43, -13, 29, 45, 21) // order:15/16 // 10 MAKE_COEF8(35, -21, -43, 4, 45, 13, -40, -29) // order:17/18 // 14 MAKE_COEF8(29, -40, -13, 45, -4, -43, 21, 35) // order:19/20 // 18 MAKE_COEF8(21, -45, 29, 13, -43, 35, 4, -40) // order:21/22 // 22 MAKE_COEF8(13, -35, 45, -40, 21, 4, -29, 43) // order:23/24 // 26 MAKE_COEF8(4, -13, 21, -29, 35, -40, 43, -45) // order:25/26 // 30 #undef MAKE_COEF8 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), (a07), (a03), (a04) }, \ { (a01), (a06), (a02), (a05) }, \ { (a15), (a08), (a12), (a11) }, \ { (a14), (a09), (a13), (a10) }, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // order:27 // 1 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) // order:31 // 3 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) // order:35 // 5 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) // order:39 // 7 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) // order:43 // 9 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) // order:47 // 11 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) // order:51 // 13 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) // order:55 // 15 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) // order:59 // 17 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) // order:63 // 19 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) // order:67 // 21 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) // order:71 // 23 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) // order:75 // 25 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) // order:79 // 27 MAKE_COEF16(7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) // order:83 // 29 MAKE_COEF16(2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) // order:87 // 31 #undef MAKE_COEF16 }; /* --------------------------------------------------------------------------- * secondary transform */ //ALIGN16(const int16_t g_2T[SEC_TR_SIZE * SEC_TR_SIZE]) = { // 123, -35, -8, -3, // e0 e1 e2 e3 // -32, -120, 30, 10, // f0 f1 f2 f3 // 14, 25, 123, -22, // g0 g1 g2 g3 // 8, 13, 19, 126 // h0 h1 h2 h3 //}; ALIGN16(static const int16_t g_2T_H[2 * (2 * SEC_TR_SIZE)]) = { 123, -35, -32, -120, 14, 25, 8, 13, // e0 e1 f0 f1 g0 g1 h0 h1 -8, -3, 30, 10, 123, -22, 19, 126 // e2 e3 f2 f3 g2 g3 h2 h3 }; ALIGN16(static 
const int16_t g_2T_V[8 * (2 * SEC_TR_SIZE)]) = { 123, -35, 123, -35, 123, -35, 123, -35, // e0 e1 e0 e1 e0 e1 e0 e1 -8, -3, -8, -3, -8, -3, -8, -3, // e2 e3 e2 e3 e2 e3 e2 e3 -32, -120, -32, -120, -32, -120, -32, -120, // f0 f1 f0 f1 f0 f1 f0 f1 30, 10, 30, 10, 30, 10, 30, 10, // f2 f3 f2 f3 f2 f3 f2 f3 14, 25, 14, 25, 14, 25, 14, 25, // g0 g1 g0 g1 g0 g1 g0 g1 123, -22, 123, -22, 123, -22, 123, -22, // g2 g3 g2 g3 g2 g3 g2 g3 8, 13, 8, 13, 8, 13, 8, 13, // h0 h1 h0 h1 h0 h1 h0 h1 19, 126, 19, 126, 19, 126, 19, 126, // h2 h3 h2 h3 h2 h3 h2 h3 }; /* --------------------------------------------------------------------------- * secondary transform (only for 4x4) */ //ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]) = { // 34, 58, 72, 81, // e0 e1 e2 e3 // 77, 69, -7, -75, // f0 f1 f2 f3 // 79, -33, -75, 58, // g0 g1 g2 g3 // 55, -84, 73, -28 // h0 h1 h2 h3 //}; ALIGN16(static const int16_t g_2TC_H[2 * (2 * SEC_TR_SIZE)]) = { 34, 58, 77, 69, 79, -33, 55, -84, // e0 e1 f0 f1 g0 g1 h0 h1 72, 81, -7, -75, -75, 58, 73, -28 // e2 e3 f2 f3 g2 g3 h2 h3 }; ALIGN16(static const int16_t g_2TC_V[8 * (2 * SEC_TR_SIZE)]) = { 34, 58, 34, 58, 34, 58, 34, 58, // e0 e1 e0 e1 e0 e1 e0 e1 72, 81, 72, 81, 72, 81, 72, 81, // e2 e3 e2 e3 e2 e3 e2 e3 77, 69, 77, 69, 77, 69, 77, 69, // f0 f1 f0 f1 f0 f1 f0 f1 -7, -75, -7, -75, -7, -75, -7, -75, // f2 f3 f2 f3 f2 f3 f2 f3 79, -33, 79, -33, 79, -33, 79, -33, // g0 g1 g0 g1 g0 g1 g0 g1 -75, 58, -75, 58, -75, 58, -75, 58, // g2 g3 g2 g3 g2 g3 g2 g3 55, -84, 55, -84, 55, -84, 55, -84, // h0 h1 h0 h1 h0 h1 h0 h1 73, -28, 73, -28, 73, -28, 73, -28, // h2 h3 h2 h3 h2 h3 h2 h3 }; /* --------------------------------------------------------------------------- futl change 2016.12.19*/ void dct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; // Const __m128i c_add1 = _mm_set1_epi32(ADD1); __m128i c_add2 = _mm_set1_epi32(ADD2); __m128i T20, T21; __m128i T30, T31, T32, T33; __m128i T40, T41, T50, T51, T60, T61, T62, T63, T70, T71, T72, T73; __m128i T50_, T51_; __m128i Tab0, Tab1, Tab2, Tab3; Tab0 = _mm_load_si128((__m128i*)tab_dct_4[0]); Tab1 = _mm_load_si128((__m128i*)tab_dct_4[1]); Tab2 = _mm_load_si128((__m128i*)tab_dct_4[2]); Tab3 = _mm_load_si128((__m128i*)tab_dct_4[3]); T20 = _mm_loadu_si128((__m128i*)(src + 0 * i_src)); T21 = _mm_loadu_si128((__m128i*)(src + 2 * i_src)); // DCT1 T30 = _mm_shuffle_epi32(T20, 0xD8); // [13 12 03 02 11 10 01 00] T31 = _mm_shuffle_epi32(T21, 0xD8); // [33 32 23 22 31 30 21 20] T32 = _mm_shufflehi_epi16(T30, 0xB1); // [12 13 02 03 11 10 01 00] T33 = _mm_shufflehi_epi16(T31, 0xB1); // [32 33 22 23 31 30 21 20] T40 = _mm_unpacklo_epi64(T32, T33); // [31 30 21 20 11 10 01 00] T41 = _mm_unpackhi_epi64(T32, T33); // [32 33 22 23 12 13 02 03] T50 = _mm_add_epi16(T40, T41); // [1+2 0+3] T51 = _mm_sub_epi16(T40, T41); // [1-2 0-3] T60 = _mm_madd_epi16(T50, Tab0); // [ 32*s12 + 32*s03] = [03 02 01 00] T61 = _mm_madd_epi16(T51, Tab1); // [ 17*d12 + 42*d03] = [13 12 11 10] T62 = _mm_madd_epi16(T50, Tab2); // [-32*s12 + 32*s03] = [23 22 21 20] T63 = _mm_madd_epi16(T51, Tab3); // [-42*d12 + 17*d03] = [33 32 31 30] T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); // [30 20 10 00] T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); // [31 21 11 01] T62 = _mm_srai_epi32(_mm_add_epi32(T62, c_add1), SHIFT1); // [32 22 12 02] T63 = 
_mm_srai_epi32(_mm_add_epi32(T63, c_add1), SHIFT1); // [33 23 13 03] // Transpose T20 = _mm_packs_epi32(T60, T61); // [13 12 11 10 03 02 01 00] T21 = _mm_packs_epi32(T62, T63); // [33 32 31 30 23 22 21 20] T30 = _mm_shuffle_epi32(T20, 0xD8); // [13 12 03 02 11 10 01 00] T31 = _mm_shuffle_epi32(T21, 0xD8); // [33 32 23 22 31 30 21 20] T32 = _mm_shufflehi_epi16(T30, 0xB1); // [12 13 02 03 11 10 01 00] T33 = _mm_shufflehi_epi16(T31, 0xB1); // [32 33 22 23 31 30 21 20] T40 = _mm_unpacklo_epi64(T32, T33); // [31 30 21 20 11 10 01 00] T41 = _mm_unpackhi_epi64(T32, T33); // [32 33 22 23 12 13 02 03] T50_ = _mm_madd_epi16(T40, Tab0); T51_ = _mm_madd_epi16(T41, Tab0); T60 = _mm_add_epi32(T50_, T51_); T50_ = _mm_madd_epi16(T40, Tab1); T51_ = _mm_madd_epi16(T41, Tab1); T61 = _mm_sub_epi32(T50_, T51_); T50_ = _mm_madd_epi16(T40, Tab2); T51_ = _mm_madd_epi16(T41, Tab2); T62 = _mm_add_epi32(T50_, T51_); T50_ = _mm_madd_epi16(T40, Tab3); T51_ = _mm_madd_epi16(T41, Tab3); T63 = _mm_sub_epi32(T50_, T51_); T70 = _mm_srai_epi32(_mm_add_epi32(T60, c_add2), SHIFT2); // [30 20 10 00] T71 = _mm_srai_epi32(_mm_add_epi32(T61, c_add2), SHIFT2); // [31 21 11 01] T72 = _mm_srai_epi32(_mm_add_epi32(T62, c_add2), SHIFT2); // [32 22 12 02] T73 = _mm_srai_epi32(_mm_add_epi32(T63, c_add2), SHIFT2); // [33 23 13 03] T70 = _mm_packs_epi32(T70, T70); T71 = _mm_packs_epi32(T71, T71); T72 = _mm_packs_epi32(T72, T72); T73 = _mm_packs_epi32(T73, T73); _mm_storel_epi64((__m128i*)(dst + 0 * 4), T70); _mm_storel_epi64((__m128i*)(dst + 1 * 4), T71); _mm_storel_epi64((__m128i*)(dst + 2 * 4), T72); _mm_storel_epi64((__m128i*)(dst + 3 * 4), T73); } /* --------------------------------------------------------------------------- futl change 2016.12.19*/ void dct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int SHIFT1 = B8X8_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B8X8_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; // Const __m128i c_add1 = _mm_set1_epi32(ADD1); // add1 = 1 __m128i c_add2 = _mm_set1_epi32(ADD2); // add2 = 128 // DCT1 __m128i T00, T01, T02, T03, T04, T05, T06, T07; __m128i T10, T11, T12, T13, T14, T15, T16, T17; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33; __m128i T40, T41, T42, T43, T44, T45, T46, T47; __m128i T50, T51, T52, T53, T54, T55, T56, T57; __m128i Tab; T00 = _mm_load_si128((__m128i*)(src + 0 * i_src)); // [07 06 05 04 03 02 01 00] T01 = _mm_load_si128((__m128i*)(src + 1 * i_src)); // [17 16 15 14 13 12 11 10] T02 = _mm_load_si128((__m128i*)(src + 2 * i_src)); // [27 26 25 24 23 22 21 20] T03 = _mm_load_si128((__m128i*)(src + 3 * i_src)); // [37 36 35 34 33 32 31 30] T04 = _mm_load_si128((__m128i*)(src + 4 * i_src)); // [47 46 45 44 43 42 41 40] T05 = _mm_load_si128((__m128i*)(src + 5 * i_src)); // [57 56 55 54 53 52 51 50] T06 = _mm_load_si128((__m128i*)(src + 6 * i_src)); // [67 66 65 64 63 62 61 60] T07 = _mm_load_si128((__m128i*)(src + 7 * i_src)); // [77 76 75 74 73 72 71 70] Tab = _mm_load_si128((__m128i*)tab_dct_8[0]); T10 = _mm_shuffle_epi8(T00, Tab); // [05 02 06 01 04 03 07 00] T11 = _mm_shuffle_epi8(T01, Tab); T12 = _mm_shuffle_epi8(T02, Tab); T13 = _mm_shuffle_epi8(T03, Tab); T14 = _mm_shuffle_epi8(T04, Tab); T15 = _mm_shuffle_epi8(T05, Tab); T16 = _mm_shuffle_epi8(T06, Tab); T17 = _mm_shuffle_epi8(T07, Tab); T20 = _mm_hadd_epi16(T10, T11); // [s25_1 s16_1 s34_1 s07_1 s25_0 s16_0 s34_0 s07_0] T21 = _mm_hadd_epi16(T12, T13); // [s25_3 s16_3 s34_3 s07_3 s25_2 s16_2 s34_2 s07_2] 
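    // Added note: even/odd butterfly of the first 1-D pass. The pairwise
    // horizontal adds (T20..T23) gather the symmetric sums s07/s34/s16/s25 of
    // each input row (even part of the 8-point DCT), while the horizontal
    // subtracts that follow (T24..T27) gather the corresponding differences
    // d07/d34/d16/d25 used for the odd-index outputs.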
T22 = _mm_hadd_epi16(T14, T15); // [s25_5 s16_5 s34_5 s07_5 s25_4 s16_4 s34_4 s07_4] T23 = _mm_hadd_epi16(T16, T17); // [s25_7 s16_7 s34_7 s07_7 s25_6 s16_6 s34_6 s07_6] T24 = _mm_hsub_epi16(T10, T11); // [d25_1 d16_1 d34_1 d07_1 d25_0 d16_0 d34_0 d07_0] T25 = _mm_hsub_epi16(T12, T13); // [d25_3 d16_3 d34_3 d07_3 d25_2 d16_2 d34_2 d07_2] T26 = _mm_hsub_epi16(T14, T15); // [d25_5 d16_5 d34_5 d07_5 d25_4 d16_4 d34_4 d07_4] T27 = _mm_hsub_epi16(T16, T17); // [d25_7 d16_7 d34_7 d07_7 d25_6 d16_6 d34_6 d07_6] T30 = _mm_hadd_epi16(T20, T21); // [EE1_3 EE0_3 EE1_2 EE0_2 EE1_1 EE0_1 EE1_0 EE0_0] T31 = _mm_hadd_epi16(T22, T23); // [EE1_7 EE0_7 EE1_6 EE0_6 EE1_5 EE0_5 EE1_4 EE0_4] T32 = _mm_hsub_epi16(T20, T21); // [EO1_3 EO0_3 EO1_2 EO0_2 EO1_1 EO0_1 EO1_0 EO0_0] T33 = _mm_hsub_epi16(T22, T23); // [EO1_7 EO0_7 EO1_6 EO0_6 EO1_5 EO0_5 EO1_4 EO0_4] Tab = _mm_load_si128((__m128i*)tab_dct_8[1]); T40 = _mm_madd_epi16(T30, Tab); T41 = _mm_madd_epi16(T31, Tab); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add1), SHIFT1); T50 = _mm_packs_epi32(T40, T41); Tab = _mm_load_si128((__m128i*)tab_dct_8[2]); T42 = _mm_madd_epi16(T30, Tab); T43 = _mm_madd_epi16(T31, Tab); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T43 = _mm_srai_epi32(_mm_add_epi32(T43, c_add1), SHIFT1); T54 = _mm_packs_epi32(T42, T43); Tab = _mm_load_si128((__m128i*)tab_dct_8[3]); T44 = _mm_madd_epi16(T32, Tab); T45 = _mm_madd_epi16(T33, Tab); T44 = _mm_srai_epi32(_mm_add_epi32(T44, c_add1), SHIFT1); T45 = _mm_srai_epi32(_mm_add_epi32(T45, c_add1), SHIFT1); T52 = _mm_packs_epi32(T44, T45); Tab = _mm_load_si128((__m128i*)tab_dct_8[4]); T46 = _mm_madd_epi16(T32, Tab); T47 = _mm_madd_epi16(T33, Tab); T46 = _mm_srai_epi32(_mm_add_epi32(T46, c_add1), SHIFT1); T47 = _mm_srai_epi32(_mm_add_epi32(T47, c_add1), SHIFT1); T56 = _mm_packs_epi32(T46, T47); Tab = _mm_load_si128((__m128i*)tab_dct_8[5]); T40 = _mm_madd_epi16(T24, Tab); T41 = _mm_madd_epi16(T25, Tab); T42 = _mm_madd_epi16(T26, Tab); T43 = _mm_madd_epi16(T27, Tab); T40 = _mm_hadd_epi32(T40, T41); T42 = _mm_hadd_epi32(T42, T43); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T51 = _mm_packs_epi32(T40, T42); Tab = _mm_load_si128((__m128i*)tab_dct_8[6]); T40 = _mm_madd_epi16(T24, Tab); T41 = _mm_madd_epi16(T25, Tab); T42 = _mm_madd_epi16(T26, Tab); T43 = _mm_madd_epi16(T27, Tab); T40 = _mm_hadd_epi32(T40, T41); T42 = _mm_hadd_epi32(T42, T43); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T53 = _mm_packs_epi32(T40, T42); Tab = _mm_load_si128((__m128i*)tab_dct_8[7]); T40 = _mm_madd_epi16(T24, Tab); T41 = _mm_madd_epi16(T25, Tab); T42 = _mm_madd_epi16(T26, Tab); T43 = _mm_madd_epi16(T27, Tab); T40 = _mm_hadd_epi32(T40, T41); T42 = _mm_hadd_epi32(T42, T43); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T55 = _mm_packs_epi32(T40, T42); Tab = _mm_load_si128((__m128i*)tab_dct_8[8]); T40 = _mm_madd_epi16(T24, Tab); T41 = _mm_madd_epi16(T25, Tab); T42 = _mm_madd_epi16(T26, Tab); T43 = _mm_madd_epi16(T27, Tab); T40 = _mm_hadd_epi32(T40, T41); T42 = _mm_hadd_epi32(T42, T43); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T57 = _mm_packs_epi32(T40, T42); Tab = _mm_load_si128((__m128i*)tab_dct_8[0]); T10 = _mm_shuffle_epi8(T50, Tab); // [05 02 06 01 04 03 07 00] T11 = 
_mm_shuffle_epi8(T51, Tab); T12 = _mm_shuffle_epi8(T52, Tab); T13 = _mm_shuffle_epi8(T53, Tab); T14 = _mm_shuffle_epi8(T54, Tab); T15 = _mm_shuffle_epi8(T55, Tab); T16 = _mm_shuffle_epi8(T56, Tab); T17 = _mm_shuffle_epi8(T57, Tab); // DCT2 Tab = _mm_load_si128((__m128i*)tab_dct_8[1]); T20 = _mm_madd_epi16(T10, Tab); // [64*s25_0 64*s16_0 64*s34_0 64*s07_0] T21 = _mm_madd_epi16(T11, Tab); // [64*s25_1 64*s16_1 64*s34_1 64*s07_1] T22 = _mm_madd_epi16(T12, Tab); // [64*s25_2 64*s16_2 64*s34_2 64*s07_2] T23 = _mm_madd_epi16(T13, Tab); // [64*s25_3 64*s16_3 64*s34_3 64*s07_3] T24 = _mm_madd_epi16(T14, Tab); // [64*s25_4 64*s16_4 64*s34_4 64*s07_4] T25 = _mm_madd_epi16(T15, Tab); // [64*s25_5 64*s16_5 64*s34_5 64*s07_5] T26 = _mm_madd_epi16(T16, Tab); // [64*s25_6 64*s16_6 64*s34_6 64*s07_6] T27 = _mm_madd_epi16(T17, Tab); // [64*s25_7 64*s16_7 64*s34_7 64*s07_7] T30 = _mm_hadd_epi32(T20, T21); // [64*(s16+s25)_1 64*(s07+s34)_1 64*(s16+s25)_0 64*(s07+s34)_0] T31 = _mm_hadd_epi32(T22, T23); // [64*(s16+s25)_3 64*(s07+s34)_3 64*(s16+s25)_2 64*(s07+s34)_2] T32 = _mm_hadd_epi32(T24, T25); // [64*(s16+s25)_5 64*(s07+s34)_5 64*(s16+s25)_4 64*(s07+s34)_4] T33 = _mm_hadd_epi32(T26, T27); // [64*(s16+s25)_7 64*(s07+s34)_7 64*(s16+s25)_6 64*(s07+s34)_6] T40 = _mm_hadd_epi32(T30, T31); // [64*((s07+s34)+(s16+s25))_3 64*((s07+s34)+(s16+s25))_2 64*((s07+s34)+(s16+s25))_1 64*((s07+s34)+(s16+s25))_0] T41 = _mm_hadd_epi32(T32, T33); // [64*((s07+s34)+(s16+s25))_7 64*((s07+s34)+(s16+s25))_6 64*((s07+s34)+(s16+s25))_5 64*((s07+s34)+(s16+s25))_4] T42 = _mm_hsub_epi32(T30, T31); // [64*((s07+s34)-(s16+s25))_3 64*((s07+s34)-(s16+s25))_2 64*((s07+s34)-(s16+s25))_1 64*((s07+s34)-(s16+s25))_0] T43 = _mm_hsub_epi32(T32, T33); // [64*((s07+s34)-(s16+s25))_7 64*((s07+s34)-(s16+s25))_6 64*((s07+s34)-(s16+s25))_5 64*((s07+s34)-(s16+s25))_4] T50 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T51 = _mm_srai_epi32(_mm_add_epi32(T41, c_add2), SHIFT2); T52 = _mm_srai_epi32(_mm_add_epi32(T42, c_add2), SHIFT2); T53 = _mm_srai_epi32(_mm_add_epi32(T43, c_add2), SHIFT2); T50 = _mm_packs_epi32(T50, T51); T52 = _mm_packs_epi32(T52, T53); _mm_store_si128((__m128i*)(dst + 0 * 8), T50); _mm_store_si128((__m128i*)(dst + 4 * 8), T52); #define MAKE_ODD(tab, dstPos) \ Tab = _mm_load_si128((__m128i const*)tab_dct_8[(tab)]); \ T20 = _mm_madd_epi16(T10, Tab); \ T21 = _mm_madd_epi16(T11, Tab); \ T22 = _mm_madd_epi16(T12, Tab); \ T23 = _mm_madd_epi16(T13, Tab); \ T24 = _mm_madd_epi16(T14, Tab); \ T25 = _mm_madd_epi16(T15, Tab); \ T26 = _mm_madd_epi16(T16, Tab); \ T27 = _mm_madd_epi16(T17, Tab); \ T30 = _mm_hadd_epi32(T20, T21); \ T31 = _mm_hadd_epi32(T22, T23); \ T32 = _mm_hadd_epi32(T24, T25); \ T33 = _mm_hadd_epi32(T26, T27); \ T40 = _mm_hadd_epi32(T30, T31); \ T41 = _mm_hadd_epi32(T32, T33); \ T50 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); \ T51 = _mm_srai_epi32(_mm_add_epi32(T41, c_add2), SHIFT2); \ T50 = _mm_packs_epi32(T50, T51); \ _mm_store_si128((__m128i*)(dst + (dstPos)* 8), T50); MAKE_ODD(9, 2); MAKE_ODD(10, 6); MAKE_ODD(11, 1); MAKE_ODD(12, 3); MAKE_ODD(13, 5); MAKE_ODD(14, 7); #undef MAKE_ODD } /* --------------------------------------------------------------------------- */ void dct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int shift1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; const int shift2 = B16X16_IN_BIT + FACTO_BIT - 2; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; const __m128i c_2 = _mm_set1_epi32(ADD1); // TODO: shift1 = 2 const __m128i 
k_ROUNDING2 = _mm_set1_epi32(ADD2); __m128i T00A, T01A, T02A, T03A, T00B, T01B, T02B, T03B; __m128i T10, T11, T12, T13; __m128i T20, T21, T22, T23; __m128i T30, T31, T32, T33; __m128i T40, T41, T44, T45; __m128i T50, T52; __m128i T60, T61, T62, T63; __m128i T70; __m128i r0, r1, t0, t1, u0, u1, u2, u3, v0, v1, v2, v3, w0, w1, w2, w3; __m128i res0, res1, res2, res3, res4, res5, res6, res7; __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i im[16]; __m128i tmpZero = _mm_setzero_si128(); //////// DCT1 16x4->4x16 /////// //input data T00A = _mm_loadu_si128((__m128i*)&src[0 * i_src + 0]); // [07 06 05 04 03 02 01 00] T00B = _mm_loadu_si128((__m128i*)&src[0 * i_src + 8]); // [0F 0E 0D 0C 0B 0A 09 08] T01A = _mm_loadu_si128((__m128i*)&src[1 * i_src + 0]); // [17 16 15 14 13 12 11 10] T01B = _mm_loadu_si128((__m128i*)&src[1 * i_src + 8]); // [1F 1E 1D 1C 1B 1A 19 18] T02A = _mm_loadu_si128((__m128i*)&src[2 * i_src + 0]); // [27 26 25 24 23 22 21 20] T02B = _mm_loadu_si128((__m128i*)&src[2 * i_src + 8]); // [2F 2E 2D 2C 2B 2A 29 28] T03A = _mm_loadu_si128((__m128i*)&src[3 * i_src + 0]); // [37 36 35 34 33 32 31 30] T03B = _mm_loadu_si128((__m128i*)&src[3 * i_src + 8]); // [3F 3E 3D 3C 3B 3A 39 38] //shuffle T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T10 = _mm_add_epi16(T00A, T00B); T11 = _mm_add_epi16(T01A, T01B); T12 = _mm_add_epi16(T02A, T02B); T13 = _mm_add_epi16(T03A, T03B); T20 = _mm_sub_epi16(T00A, T00B); T21 = _mm_sub_epi16(T01A, T01B); T22 = _mm_sub_epi16(T02A, T02B); T23 = _mm_sub_epi16(T03A, T03B); T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T40 = _mm_hadd_epi16(T30, T31); T41 = _mm_hadd_epi16(T32, T33); T44 = _mm_hsub_epi16(T30, T31); T45 = _mm_hsub_epi16(T32, T33); T50 = _mm_hadd_epi16(T40, T41); T52 = _mm_hsub_epi16(T40, T41); T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[0] = T70; //_mm_storel_epi64((__m128i*)&dst[0 * 4], T70); T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[8] = T70; //_mm_storel_epi64((__m128i*)&dst[8 * 4], T70); T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[4] = T70; //_mm_storel_epi64((__m128i*)&dst[4 * 4], T70); T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[12] = T70; //_mm_storel_epi64((__m128i*)&dst[12 * 4], T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5])); T60 = _mm_hadd_epi32(T60, T61); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[2] = T70; //_mm_storel_epi64((__m128i*)&dst[2 * 4], T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6])); T61 = _mm_madd_epi16(T45, 
_mm_load_si128((__m128i*)tab_dct_8[6])); T60 = _mm_hadd_epi32(T60, T61); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[6] = T70; //_mm_storel_epi64((__m128i*)&dst[6 * 4], T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7])); T60 = _mm_hadd_epi32(T60, T61); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, tmpZero); im[10] = T70; //_mm_storel_epi64((__m128i*)&dst[10 * 4], T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8])); T60 = _mm_hadd_epi32(T60, T61); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); T70 = _mm_packs_epi32(T60, T61); im[14] = T70; //_mm_storel_epi64((__m128i*)&dst[14 * 4], T70); #define MAKE_ODD(tab, dstPos) \ T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ T60 = _mm_hadd_epi32(T60, T61); \ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_2), shift1); \ T70 = _mm_packs_epi32(T60, tmpZero); \ im[dstPos] = T70; //_mm_storel_epi64((__m128i*)&dst[(dstPos)* 4], T70); MAKE_ODD(0, 1); MAKE_ODD(1, 3); MAKE_ODD(2, 5); MAKE_ODD(3, 7); MAKE_ODD(4, 9); MAKE_ODD(5, 11); MAKE_ODD(6, 13); MAKE_ODD(7, 15); #undef MAKE_ODD //////// DCT2 16x4->4x16 /////// //1st 4x4 t0 = _mm_unpacklo_epi64(im[0], im[1]); t1 = _mm_unpacklo_epi64(im[2], im[3]); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[0])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[0])); u0 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[1])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[1])); u1 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[2])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[2])); u2 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[3])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[3])); u3 = _mm_hadd_epi32(r0, r1); v0 = _mm_add_epi32(u0, k_ROUNDING2); v1 = _mm_add_epi32(u1, k_ROUNDING2); v2 = _mm_add_epi32(u2, k_ROUNDING2); v3 = _mm_add_epi32(u3, k_ROUNDING2); w0 = _mm_srai_epi32(v0, shift2); w1 = _mm_srai_epi32(v1, shift2); w2 = _mm_srai_epi32(v2, shift2); w3 = _mm_srai_epi32(v3, shift2); res0 = _mm_packs_epi32(w0, w1); res1 = _mm_packs_epi32(w2, w3); //2nd 4x4 t0 = _mm_unpacklo_epi64(im[4], im[5]); t1 = _mm_unpacklo_epi64(im[6], im[7]); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[0])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[0])); u0 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[1])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[1])); u1 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[2])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[2])); u2 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[3])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[3])); u3 = _mm_hadd_epi32(r0, r1); v0 = _mm_add_epi32(u0, k_ROUNDING2); v1 = _mm_add_epi32(u1, k_ROUNDING2); v2 = _mm_add_epi32(u2, 
k_ROUNDING2); v3 = _mm_add_epi32(u3, k_ROUNDING2); w0 = _mm_srai_epi32(v0, shift2); w1 = _mm_srai_epi32(v1, shift2); w2 = _mm_srai_epi32(v2, shift2); w3 = _mm_srai_epi32(v3, shift2); res2 = _mm_packs_epi32(w0, w1); res3 = _mm_packs_epi32(w2, w3); //3rd 4x4 t0 = _mm_unpacklo_epi64(im[8], im[9]); t1 = _mm_unpacklo_epi64(im[10], im[11]); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[0])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[0])); u0 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[1])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[1])); u1 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[2])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[2])); u2 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[3])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[3])); u3 = _mm_hadd_epi32(r0, r1); v0 = _mm_add_epi32(u0, k_ROUNDING2); v1 = _mm_add_epi32(u1, k_ROUNDING2); v2 = _mm_add_epi32(u2, k_ROUNDING2); v3 = _mm_add_epi32(u3, k_ROUNDING2); w0 = _mm_srai_epi32(v0, shift2); w1 = _mm_srai_epi32(v1, shift2); w2 = _mm_srai_epi32(v2, shift2); w3 = _mm_srai_epi32(v3, shift2); res4 = _mm_packs_epi32(w0, w1); res5 = _mm_packs_epi32(w2, w3); //4th 4x4 t0 = _mm_unpacklo_epi64(im[12], im[13]); t1 = _mm_unpacklo_epi64(im[14], im[15]); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[0])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[0])); u0 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[1])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[1])); u1 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[2])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[2])); u2 = _mm_hadd_epi32(r0, r1); r0 = _mm_madd_epi16(t0, _mm_load_si128((__m128i*)tab_dct1_4[3])); r1 = _mm_madd_epi16(t1, _mm_load_si128((__m128i*)tab_dct1_4[3])); u3 = _mm_hadd_epi32(r0, r1); v0 = _mm_add_epi32(u0, k_ROUNDING2); v1 = _mm_add_epi32(u1, k_ROUNDING2); v2 = _mm_add_epi32(u2, k_ROUNDING2); v3 = _mm_add_epi32(u3, k_ROUNDING2); w0 = _mm_srai_epi32(v0, shift2); w1 = _mm_srai_epi32(v1, shift2); w2 = _mm_srai_epi32(v2, shift2); w3 = _mm_srai_epi32(v3, shift2); res6 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); //store d0 = _mm_unpacklo_epi64(res0, res2); d1 = _mm_unpacklo_epi64(res4, res6); d2 = _mm_unpackhi_epi64(res0, res2); d3 = _mm_unpackhi_epi64(res4, res6); d4 = _mm_unpacklo_epi64(res1, res3); d5 = _mm_unpacklo_epi64(res5, res7); d6 = _mm_unpackhi_epi64(res1, res3); d7 = _mm_unpackhi_epi64(res5, res7); _mm_storeu_si128((__m128i *)(dst + 0), d0); _mm_storeu_si128((__m128i *)(dst + 8), d1); _mm_storeu_si128((__m128i *)(dst + 16), d2); _mm_storeu_si128((__m128i *)(dst + 24), d3); _mm_storeu_si128((__m128i *)(dst + 32), d4); _mm_storeu_si128((__m128i *)(dst + 40), d5); _mm_storeu_si128((__m128i *)(dst + 48), d6); _mm_storeu_si128((__m128i *)(dst + 56), d7); } /* --------------------------------------------------------------------------- */ void dct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2; const int ADD1 = (1 << SHIFT1) >> 1; const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT; const int ADD2 = (1 << SHIFT2) >> 1; const __m128i c_add1 = _mm_set1_epi32(ADD1); const __m128i c_add2 = _mm_set1_epi32(ADD2); const __m128i k_p32_p32 = 
_mm_set1_epi16(32); const __m128i k_p32_m32 = pair_set_epi16(32, -32); const __m128i k_p17_p42 = pair_set_epi16(17, 42); const __m128i k_m42_p17 = pair_set_epi16(-42, 17); __m128i in[16]; __m128i tr00, tr01; __m128i r0, r1, r2, r3, t0, t2; __m128i u10, u11, u12, u13, u20, u21, u22, u23; __m128i T00A, T01A, T02A, T03A, T00B, T01B, T02B, T03B; __m128i T10, T11, T12, T13, T14, T15, T16, T17; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33; __m128i T40, T41; __m128i T70; __m128i tmpZero = _mm_setzero_si128(); __m128i tab_dct_16_02 = _mm_loadu_si128((__m128i*)tab_dct_16_0[2]); __m128i tab_dct_16_03 = _mm_loadu_si128((__m128i*)tab_dct_16_0[3]); __m128i tab_dct_8_1 = _mm_loadu_si128((__m128i*)tab_dct_8[1]); __m128i tab_dct_16_18 = _mm_loadu_si128((__m128i*)tab_dct_16_1[8]); __m128i tab_dct_16_19 = _mm_loadu_si128((__m128i*)tab_dct_16_1[9]); __m128i tab_dct_16_110 = _mm_loadu_si128((__m128i*)tab_dct_16_1[10]); __m128i tab_dct_16_111 = _mm_loadu_si128((__m128i*)tab_dct_16_1[11]); __m128i tab_dct_16_112 = _mm_loadu_si128((__m128i*)tab_dct_16_1[12]); __m128i tab_dct_16_113 = _mm_loadu_si128((__m128i*)tab_dct_16_1[13]); ///// DCT1 4x16->16x4////// in[0] = _mm_load_si128((const __m128i *)(src + 0 * i_src)); in[1] = _mm_load_si128((const __m128i *)(src + 2 * i_src)); in[4] = _mm_load_si128((const __m128i *)(src + 4 * i_src)); in[5] = _mm_load_si128((const __m128i *)(src + 6 * i_src)); in[8] = _mm_load_si128((const __m128i *)(src + 8 * i_src)); in[9] = _mm_load_si128((const __m128i *)(src + 10 * i_src)); in[12] = _mm_load_si128((const __m128i *)(src + 12 * i_src)); in[13] = _mm_load_si128((const __m128i *)(src + 14 * i_src)); //transpose input data //1st 4x4 tr00 = _mm_shuffle_epi32(in[0], 0xD8);//00 01 04 05 02 03 06 07 tr01 = _mm_shuffle_epi32(in[1], 0xD8);//08 09 12 13 10 11 14 15 r0 = _mm_unpacklo_epi64(tr00, tr01);//00 01 04 05 08 09 12 13 r1 = _mm_unpackhi_epi64(tr00, tr01);//02 03 06 07 10 11 14 15 r2 = _mm_shufflehi_epi16(r0, 0xB1); r2 = _mm_shufflelo_epi16(r2, 0xB1);//01 00 05 04 09 08 13 12 r3 = _mm_shufflehi_epi16(r1, 0xB1); r3 = _mm_shufflelo_epi16(r3, 0xB1);//03 02 07 06 11 10 15 14 t0 = _mm_add_epi16(r0, r3);//00+03 01+02 04+07 05+06 08+11 09+10 12+15 13+14 t2 = _mm_sub_epi16(r2, r1); u10 = _mm_madd_epi16(t0, k_p32_p32);//(00+03)*32+(01+02)*32 (04+07)*32+(05+06)*32 (08+11)*32+(09+10)*32 (12+15)*32+(13+14)*32 u12 = _mm_madd_epi16(t0, k_p32_m32);//(00+03)*32-(01+02)*32 (04+07)*32-(05+06)*32 (08+11)*32-(09+10)*32 (12+15)*32-(13+14)*32 u11 = _mm_madd_epi16(t2, k_p17_p42); u13 = _mm_madd_epi16(t2, k_m42_p17); //λ u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1); u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1); u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1); u13 = _mm_srai_epi32(_mm_add_epi32(u13, c_add1), SHIFT1); //2nd 4x4 tr00 = _mm_shuffle_epi32(in[4], 0xD8); tr01 = _mm_shuffle_epi32(in[5], 0xD8); r0 = _mm_unpacklo_epi64(tr00, tr01); r1 = _mm_unpackhi_epi64(tr00, tr01); r2 = _mm_shufflehi_epi16(r0, 0xB1); r2 = _mm_shufflelo_epi16(r2, 0xB1); r3 = _mm_shufflehi_epi16(r1, 0xB1); r3 = _mm_shufflelo_epi16(r3, 0xB1); t0 = _mm_add_epi16(r0, r3); t2 = _mm_sub_epi16(r2, r1); u20 = _mm_madd_epi16(t0, k_p32_p32); u22 = _mm_madd_epi16(t0, k_p32_m32); u21 = _mm_madd_epi16(t2, k_p17_p42); u23 = _mm_madd_epi16(t2, k_m42_p17); //λ u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1); u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1); u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1); u23 = _mm_srai_epi32(_mm_add_epi32(u23, c_add1), 
SHIFT1); T00A = _mm_packs_epi32(u10, u20); T01A = _mm_packs_epi32(u11, u21); T02A = _mm_packs_epi32(u12, u22); T03A = _mm_packs_epi32(u13, u23); //3rd 4x4 tr00 = _mm_shuffle_epi32(in[8], 0xD8); tr01 = _mm_shuffle_epi32(in[9], 0xD8); r0 = _mm_unpacklo_epi64(tr00, tr01); r1 = _mm_unpackhi_epi64(tr00, tr01); r2 = _mm_shufflehi_epi16(r0, 0xB1); r2 = _mm_shufflelo_epi16(r2, 0xB1); r3 = _mm_shufflehi_epi16(r1, 0xB1); r3 = _mm_shufflelo_epi16(r3, 0xB1); t0 = _mm_add_epi16(r0, r3); t2 = _mm_sub_epi16(r2, r1); u10 = _mm_madd_epi16(t0, k_p32_p32); u12 = _mm_madd_epi16(t0, k_p32_m32); u11 = _mm_madd_epi16(t2, k_p17_p42); u13 = _mm_madd_epi16(t2, k_m42_p17); //λ u10 = _mm_srai_epi32(_mm_add_epi32(u10, c_add1), SHIFT1); u11 = _mm_srai_epi32(_mm_add_epi32(u11, c_add1), SHIFT1); u12 = _mm_srai_epi32(_mm_add_epi32(u12, c_add1), SHIFT1); u13 = _mm_srai_epi32(_mm_add_epi32(u13, c_add1), SHIFT1); //4th 4x4 tr00 = _mm_shuffle_epi32(in[12], 0xD8); tr01 = _mm_shuffle_epi32(in[13], 0xD8); r0 = _mm_unpacklo_epi64(tr00, tr01); r1 = _mm_unpackhi_epi64(tr00, tr01); r2 = _mm_shufflehi_epi16(r0, 0xB1); r2 = _mm_shufflelo_epi16(r2, 0xB1); r3 = _mm_shufflehi_epi16(r1, 0xB1); r3 = _mm_shufflelo_epi16(r3, 0xB1); t0 = _mm_add_epi16(r0, r3); t2 = _mm_sub_epi16(r2, r1); u20 = _mm_madd_epi16(t0, k_p32_p32); u22 = _mm_madd_epi16(t0, k_p32_m32); u21 = _mm_madd_epi16(t2, k_p17_p42); u23 = _mm_madd_epi16(t2, k_m42_p17); //λ u20 = _mm_srai_epi32(_mm_add_epi32(u20, c_add1), SHIFT1); u21 = _mm_srai_epi32(_mm_add_epi32(u21, c_add1), SHIFT1); u22 = _mm_srai_epi32(_mm_add_epi32(u22, c_add1), SHIFT1); u23 = _mm_srai_epi32(_mm_add_epi32(u23, c_add1), SHIFT1); T00B = _mm_packs_epi32(u10, u20); T01B = _mm_packs_epi32(u11, u21); T02B = _mm_packs_epi32(u12, u22); T03B = _mm_packs_epi32(u13, u23); ///// DCT2 16x4->4x16////// T00A = _mm_shuffle_epi8(T00A, tab_dct_16_02);//00 03 01 02 07 04 06 05 T00B = _mm_shuffle_epi8(T00B, tab_dct_16_03);//17 14 16 15 10 13 11 12 T01A = _mm_shuffle_epi8(T01A, tab_dct_16_02); T01B = _mm_shuffle_epi8(T01B, tab_dct_16_03); T02A = _mm_shuffle_epi8(T02A, tab_dct_16_02); T02B = _mm_shuffle_epi8(T02B, tab_dct_16_03); T03A = _mm_shuffle_epi8(T03A, tab_dct_16_02); T03B = _mm_shuffle_epi8(T03B, tab_dct_16_03); T10 = _mm_unpacklo_epi16(T00A, T00B);//00 17 03 14 01 16 02 15 T11 = _mm_unpackhi_epi16(T00A, T00B);//07 10 04 13 06 11 05 12 T12 = _mm_unpacklo_epi16(T01A, T01B); T13 = _mm_unpackhi_epi16(T01A, T01B); T14 = _mm_unpacklo_epi16(T02A, T02B); T15 = _mm_unpackhi_epi16(T02A, T02B); T16 = _mm_unpacklo_epi16(T03A, T03B); T17 = _mm_unpackhi_epi16(T03A, T03B); T20 = _mm_madd_epi16(T10, tab_dct_8_1);//00+17 03+14 01+16 02+15 *32 T21 = _mm_madd_epi16(T11, tab_dct_8_1);//07+10 04+13 06+11 05+12 *32 T22 = _mm_madd_epi16(T12, tab_dct_8_1); T23 = _mm_madd_epi16(T13, tab_dct_8_1); T24 = _mm_madd_epi16(T14, tab_dct_8_1); T25 = _mm_madd_epi16(T15, tab_dct_8_1); T26 = _mm_madd_epi16(T16, tab_dct_8_1); T27 = _mm_madd_epi16(T17, tab_dct_8_1); T30 = _mm_add_epi32(T20, T21);//00+17 + 07+10 03+14 + 04+13 01+16 + 06+11 02+15 + 05+12 T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hsub_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[0 * 4], T70); T70 = _mm_packs_epi32(T41, tmpZero); _mm_storel_epi64((__m128i*)&dst[8 * 4], T70); T20 = 
_mm_madd_epi16(T10, tab_dct_16_18); T21 = _mm_madd_epi16(T11, tab_dct_16_18); T22 = _mm_madd_epi16(T12, tab_dct_16_18); T23 = _mm_madd_epi16(T13, tab_dct_16_18); T24 = _mm_madd_epi16(T14, tab_dct_16_18); T25 = _mm_madd_epi16(T15, tab_dct_16_18); T26 = _mm_madd_epi16(T16, tab_dct_16_18); T27 = _mm_madd_epi16(T17, tab_dct_16_18); T30 = _mm_add_epi32(T20, T21); T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[4 * 4], T70); T20 = _mm_madd_epi16(T10, tab_dct_16_19); T21 = _mm_madd_epi16(T11, tab_dct_16_19); T22 = _mm_madd_epi16(T12, tab_dct_16_19); T23 = _mm_madd_epi16(T13, tab_dct_16_19); T24 = _mm_madd_epi16(T14, tab_dct_16_19); T25 = _mm_madd_epi16(T15, tab_dct_16_19); T26 = _mm_madd_epi16(T16, tab_dct_16_19); T27 = _mm_madd_epi16(T17, tab_dct_16_19); T30 = _mm_add_epi32(T20, T21); T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[12 * 4], T70); T20 = _mm_madd_epi16(T10, tab_dct_16_110); T21 = _mm_madd_epi16(T11, tab_dct_16_110); T22 = _mm_madd_epi16(T12, tab_dct_16_110); T23 = _mm_madd_epi16(T13, tab_dct_16_110); T24 = _mm_madd_epi16(T14, tab_dct_16_110); T25 = _mm_madd_epi16(T15, tab_dct_16_110); T26 = _mm_madd_epi16(T16, tab_dct_16_110); T27 = _mm_madd_epi16(T17, tab_dct_16_110); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[2 * 4], T70); T20 = _mm_madd_epi16(T10, tab_dct_16_111); T21 = _mm_madd_epi16(T11, tab_dct_16_111); T22 = _mm_madd_epi16(T12, tab_dct_16_111); T23 = _mm_madd_epi16(T13, tab_dct_16_111); T24 = _mm_madd_epi16(T14, tab_dct_16_111); T25 = _mm_madd_epi16(T15, tab_dct_16_111); T26 = _mm_madd_epi16(T16, tab_dct_16_111); T27 = _mm_madd_epi16(T17, tab_dct_16_111); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[6 * 4], T70); T20 = _mm_madd_epi16(T10, tab_dct_16_112); T21 = _mm_madd_epi16(T11, tab_dct_16_112); T22 = _mm_madd_epi16(T12, tab_dct_16_112); T23 = _mm_madd_epi16(T13, tab_dct_16_112); T24 = _mm_madd_epi16(T14, tab_dct_16_112); T25 = _mm_madd_epi16(T15, tab_dct_16_112); T26 = _mm_madd_epi16(T16, tab_dct_16_112); T27 = _mm_madd_epi16(T17, tab_dct_16_112); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[10 * 4], T70); T20 = _mm_madd_epi16(T10, tab_dct_16_113); T21 = 
_mm_madd_epi16(T11, tab_dct_16_113); T22 = _mm_madd_epi16(T12, tab_dct_16_113); T23 = _mm_madd_epi16(T13, tab_dct_16_113); T24 = _mm_madd_epi16(T14, tab_dct_16_113); T25 = _mm_madd_epi16(T15, tab_dct_16_113); T26 = _mm_madd_epi16(T16, tab_dct_16_113); T27 = _mm_madd_epi16(T17, tab_dct_16_113); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T70 = _mm_packs_epi32(T40, tmpZero); _mm_storel_epi64((__m128i*)&dst[14 * 4], T70); __m128i tab_dct_16_1_tab; __m128i tab_dct_16_1_tab1; #define MAKE_ODD(tab, dstPos) \ tab_dct_16_1_tab = _mm_loadu_si128((__m128i const*)tab_dct_16_1[(tab)]); \ tab_dct_16_1_tab1 = _mm_loadu_si128((__m128i const*)tab_dct_16_1[(tab + 1)]); \ T20 = _mm_madd_epi16(T10, tab_dct_16_1_tab); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \ T21 = _mm_madd_epi16(T11, tab_dct_16_1_tab1); /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \ T22 = _mm_madd_epi16(T12, tab_dct_16_1_tab); \ T23 = _mm_madd_epi16(T13, tab_dct_16_1_tab1); \ T24 = _mm_madd_epi16(T14, tab_dct_16_1_tab); \ T25 = _mm_madd_epi16(T15, tab_dct_16_1_tab1); \ T26 = _mm_madd_epi16(T16, tab_dct_16_1_tab); \ T27 = _mm_madd_epi16(T17, tab_dct_16_1_tab1); \ \ T30 = _mm_add_epi32(T20, T21); \ T31 = _mm_add_epi32(T22, T23); \ T32 = _mm_add_epi32(T24, T25); \ T33 = _mm_add_epi32(T26, T27); \ \ T30 = _mm_hadd_epi32(T30, T31); \ T31 = _mm_hadd_epi32(T32, T33); \ \ T40 = _mm_hadd_epi32(T30, T31); \ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); \ \ T70 = _mm_packs_epi32(T40, tmpZero); \ _mm_storel_epi64((__m128i*)&dst[(dstPos)* 4], T70); MAKE_ODD(14, 1); MAKE_ODD(16, 3); MAKE_ODD(18, 5); MAKE_ODD(20, 7); MAKE_ODD(22, 9); MAKE_ODD(24, 11); MAKE_ODD(26, 13); MAKE_ODD(28, 15); #undef MAKE_ODD } /* --------------------------------------------------------------------------- */ void dct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT; const int ADD1 = (1 << SHIFT1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; ALIGN32(int16_t tmp[16 * 16]); // Const __m128i c_add1 = _mm_set1_epi32(ADD1); __m128i c_add2 = _mm_set1_epi32(ADD2); __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; __m128i T10, T11, T12, T13, T14, T15, T16, T17; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33, T34, T35, T36, T37; __m128i T40, T41, T42, T43, T44, T45, T46, T47; __m128i T50, T51, T52, T53; __m128i T60, T61, T62, T63, T64, T65, T66, T67; __m128i T70; int i; // DCT1 for (i = 0; i < 16; i += 8) { T00A = _mm_load_si128((__m128i*)(src + (i + 0) * i_src + 0)); // [07 06 05 04 03 02 01 00] T00B = _mm_load_si128((__m128i*)(src + (i + 0) * i_src + 8)); // [0F 0E 0D 0C 0B 0A 09 08] T01A = _mm_load_si128((__m128i*)(src + (i + 1) * i_src + 0)); // [17 16 15 14 13 12 11 10] T01B = _mm_load_si128((__m128i*)(src + (i + 1) * i_src + 8)); // [1F 1E 1D 1C 1B 1A 19 18] T02A = _mm_load_si128((__m128i*)(src + (i + 2) * i_src + 0)); // [27 26 25 24 23 22 21 20] T02B = _mm_load_si128((__m128i*)(src + (i + 2) * i_src + 8)); // [2F 2E 2D 2C 2B 2A 29 28] T03A = _mm_load_si128((__m128i*)(src + (i + 3) * i_src + 0)); // [37 36 35 34 33 32 31 30] T03B = _mm_load_si128((__m128i*)(src + (i + 3) * i_src + 8)); // [3F 3E 3D 3C 3B 3A 39 38] T04A = 
_mm_load_si128((__m128i*)(src + (i + 4) * i_src + 0)); // [47 46 45 44 43 42 41 40] T04B = _mm_load_si128((__m128i*)(src + (i + 4) * i_src + 8)); // [4F 4E 4D 4C 4B 4A 49 48] T05A = _mm_load_si128((__m128i*)(src + (i + 5) * i_src + 0)); // [57 56 55 54 53 52 51 50] T05B = _mm_load_si128((__m128i*)(src + (i + 5) * i_src + 8)); // [5F 5E 5D 5C 5B 5A 59 58] T06A = _mm_load_si128((__m128i*)(src + (i + 6) * i_src + 0)); // [67 66 65 64 63 62 61 60] T06B = _mm_load_si128((__m128i*)(src + (i + 6) * i_src + 8)); // [6F 6E 6D 6C 6B 6A 69 68] T07A = _mm_load_si128((__m128i*)(src + (i + 7) * i_src + 0)); // [77 76 75 74 73 72 71 70] T07B = _mm_load_si128((__m128i*)(src + (i + 7) * i_src + 8)); // [7F 7E 7D 7C 7B 7A 79 78] T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T10 = _mm_add_epi16(T00A, T00B); T11 = _mm_add_epi16(T01A, T01B); T12 = _mm_add_epi16(T02A, T02B); T13 = _mm_add_epi16(T03A, T03B); T14 = _mm_add_epi16(T04A, T04B); T15 = _mm_add_epi16(T05A, T05B); T16 = _mm_add_epi16(T06A, T06B); T17 = _mm_add_epi16(T07A, T07B); T20 = _mm_sub_epi16(T00A, T00B); T21 = _mm_sub_epi16(T01A, T01B); T22 = _mm_sub_epi16(T02A, T02B); T23 = _mm_sub_epi16(T03A, T03B); T24 = _mm_sub_epi16(T04A, T04B); T25 = _mm_sub_epi16(T05A, T05B); T26 = _mm_sub_epi16(T06A, T06B); T27 = _mm_sub_epi16(T07A, T07B); T30 = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T31 = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T32 = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T33 = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T34 = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T35 = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T36 = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T37 = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T40 = _mm_hadd_epi16(T30, T31); T41 = _mm_hadd_epi16(T32, T33); T42 = _mm_hadd_epi16(T34, T35); T43 = _mm_hadd_epi16(T36, T37); T44 = _mm_hsub_epi16(T30, T31); T45 = _mm_hsub_epi16(T32, T33); T46 = _mm_hsub_epi16(T34, T35); T47 = _mm_hsub_epi16(T36, T37); T50 = _mm_hadd_epi16(T40, T41); T51 = _mm_hadd_epi16(T42, T43); T52 = _mm_hsub_epi16(T40, T41); T53 = _mm_hsub_epi16(T42, T43); T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1])); T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 0 * 16 + i), T70); T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2])); T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 8 * 16 + i), T70); T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3])); T61 = 
_mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 4 * 16 + i), T70); T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4])); T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4])); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 12 * 16 + i), T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5])); T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5])); T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5])); T60 = _mm_hadd_epi32(T60, T61); T61 = _mm_hadd_epi32(T62, T63); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 2 * 16 + i), T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6])); T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6])); T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6])); T60 = _mm_hadd_epi32(T60, T61); T61 = _mm_hadd_epi32(T62, T63); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 6 * 16 + i), T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7])); T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7])); T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7])); T60 = _mm_hadd_epi32(T60, T61); T61 = _mm_hadd_epi32(T62, T63); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 10 * 16 + i), T70); T60 = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8])); T61 = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8])); T62 = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8])); T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8])); T60 = _mm_hadd_epi32(T60, T61); T61 = _mm_hadd_epi32(T62, T63); T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); T70 = _mm_packs_epi32(T60, T61); _mm_store_si128((__m128i*)(tmp + 14 * 16 + i), T70); #define MAKE_ODD(tab, dstPos) \ T60 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T61 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T62 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T63 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T64 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T65 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T66 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T67 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ T62 = _mm_hadd_epi32(T64, T65); \ T63 = _mm_hadd_epi32(T66, T67); \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = 
_mm_hadd_epi32(T62, T63); \ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add1), SHIFT1); \ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_add1), SHIFT1); \ T70 = _mm_packs_epi32(T60, T61); \ _mm_store_si128((__m128i*)(tmp + (dstPos) * 16 + i), T70); MAKE_ODD(0, 1); MAKE_ODD(1, 3); MAKE_ODD(2, 5); MAKE_ODD(3, 7); MAKE_ODD(4, 9); MAKE_ODD(5, 11); MAKE_ODD(6, 13); MAKE_ODD(7, 15); #undef MAKE_ODD } // DCT2 for (i = 0; i < 16; i += 4) { T00A = _mm_load_si128((__m128i*)(tmp + (i + 0) * 16 + 0)); // [07 06 05 04 03 02 01 00] T00B = _mm_load_si128((__m128i*)(tmp + (i + 0) * 16 + 8)); // [0F 0E 0D 0C 0B 0A 09 08] T01A = _mm_load_si128((__m128i*)(tmp + (i + 1) * 16 + 0)); // [17 16 15 14 13 12 11 10] T01B = _mm_load_si128((__m128i*)(tmp + (i + 1) * 16 + 8)); // [1F 1E 1D 1C 1B 1A 19 18] T02A = _mm_load_si128((__m128i*)(tmp + (i + 2) * 16 + 0)); // [27 26 25 24 23 22 21 20] T02B = _mm_load_si128((__m128i*)(tmp + (i + 2) * 16 + 8)); // [2F 2E 2D 2C 2B 2A 29 28] T03A = _mm_load_si128((__m128i*)(tmp + (i + 3) * 16 + 0)); // [37 36 35 34 33 32 31 30] T03B = _mm_load_si128((__m128i*)(tmp + (i + 3) * 16 + 8)); // [3F 3E 3D 3C 3B 3A 39 38] T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3])); T10 = _mm_unpacklo_epi16(T00A, T00B); T11 = _mm_unpackhi_epi16(T00A, T00B); T12 = _mm_unpacklo_epi16(T01A, T01B); T13 = _mm_unpackhi_epi16(T01A, T01B); T14 = _mm_unpacklo_epi16(T02A, T02B); T15 = _mm_unpackhi_epi16(T02A, T02B); T16 = _mm_unpacklo_epi16(T03A, T03B); T17 = _mm_unpackhi_epi16(T03A, T03B); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1])); T30 = _mm_add_epi32(T20, T21); T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hsub_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); T41 = _mm_packs_epi32(T41, T41); _mm_storel_epi64((__m128i*)(dst + 0 * 16 + i), T40); _mm_storel_epi64((__m128i*)(dst + 8 * 16 + i), T41); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T26 = _mm_madd_epi16(T16, 
_mm_load_si128((__m128i*)tab_dct_16_1[8])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T30 = _mm_add_epi32(T20, T21); T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 4 * 16 + i), T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T30 = _mm_add_epi32(T20, T21); T31 = _mm_add_epi32(T22, T23); T32 = _mm_add_epi32(T24, T25); T33 = _mm_add_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 12 * 16 + i), T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10])); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 2 * 16 + i), T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11])); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 6 * 16 + i), T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T24 
= _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12])); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 10 * 16 + i), T40); T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13])); T30 = _mm_sub_epi32(T20, T21); T31 = _mm_sub_epi32(T22, T23); T32 = _mm_sub_epi32(T24, T25); T33 = _mm_sub_epi32(T26, T27); T30 = _mm_hadd_epi32(T30, T31); T31 = _mm_hadd_epi32(T32, T33); T40 = _mm_hadd_epi32(T30, T31); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T40 = _mm_packs_epi32(T40, T40); _mm_storel_epi64((__m128i*)(dst + 14 * 16 + i), T40); #define MAKE_ODD(tab, dstPos) \ T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) ])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \ T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \ T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) ])); \ T23 = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ T24 = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) ])); \ T25 = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ T26 = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) ])); \ T27 = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \ \ T30 = _mm_add_epi32(T20, T21); \ T31 = _mm_add_epi32(T22, T23); \ T32 = _mm_add_epi32(T24, T25); \ T33 = _mm_add_epi32(T26, T27); \ \ T30 = _mm_hadd_epi32(T30, T31); \ T31 = _mm_hadd_epi32(T32, T33); \ \ T40 = _mm_hadd_epi32(T30, T31); \ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); \ T40 = _mm_packs_epi32(T40, T40); \ _mm_storel_epi64((__m128i*)(dst + (dstPos) * 16 + i), T40); MAKE_ODD(14, 1); MAKE_ODD(16, 3); MAKE_ODD(18, 5); MAKE_ODD(20, 7); MAKE_ODD(22, 9); MAKE_ODD(24, 11); MAKE_ODD(26, 13); MAKE_ODD(28, 15); #undef MAKE_ODD } } /* --------------------------------------------------------------------------- */ void dct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) { int i; int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); int shift2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; const __m128i c_512 = _mm_set1_epi32(ADD2); // TODO: shift1 = 2 const __m128i k_ROUNDING1 = _mm_set1_epi32(ADD1); __m128i T00A, T01A, T02A, T03A; __m128i T00B, T01B, T02B, T03B; __m128i T00C, T01C, T02C, T03C; __m128i T00D, T01D, T02D, T03D; __m128i T10A, T11A, T12A, T13A; __m128i T10B, T11B, T12B, T13B; __m128i T20, T21, T22, T23, 
T24, T25, T26, T27; __m128i T30, T31, T32, T33, T34, T35, T36, T37; __m128i T60, T61, T62, T63, T64, T65, T66, T67; __m128i TT00A, TT01A, TT02A, TT03A; __m128i TT00B, TT01B, TT02B, TT03B; __m128i TT00C, TT01C, TT02C, TT03C; __m128i TT00D, TT01D, TT02D, TT03D; __m128i TT10A, TT11A, TT12A, TT13A; __m128i TT10B, TT11B, TT12B, TT13B; __m128i TT20, TT21, TT22, TT23, TT24, TT25, TT26, TT27; __m128i TT30, TT31, TT32, TT33, TT34, TT35, TT36, TT37; __m128i TT60, TT61, TT62, TT63, TT64, TT65, TT66, TT67; __m128i tResult; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i in0, in1, in2, in3, in4, in5, in6, in7; __m128i res0[4], res1[4], res2[4], res3[4], res4[4], res5[4], res6[4], res7[4]; __m128i r0, r1, r2, r3, t0, t1, t2, t3; __m128i q0, q1, q2, q3, q4, q5, q6, q7, u0, u1, u2, u3, u4, u5, u6, u7, v0, v1, v2, v3, v4, v5, v6, v7, w0, w1, w2, w3, w4, w5, w6, w7; const __m128i k_p32_p32 = _mm_set1_epi16(32); const __m128i k_p32_m32 = pair_set_epi16(32, -32); const __m128i k_p42_p17 = pair_set_epi16(42, 17); const __m128i k_p17_m42 = pair_set_epi16(17, -42); const __m128i k_p44_p38 = pair_set_epi16(44, 38); const __m128i k_p25_p9 = pair_set_epi16(25, 9); const __m128i k_p38_m9 = pair_set_epi16(38, -9); const __m128i k_m44_m25 = pair_set_epi16(-44, -25); const __m128i k_p25_m44 = pair_set_epi16(25, -44); const __m128i k_p9_p38 = pair_set_epi16(9, 38); const __m128i k_p9_m25 = pair_set_epi16(9, -25); const __m128i k_p38_m44 = pair_set_epi16(38, -44); i_src &= 0xFE; for (i = 0; i < 32 / 8; i++) { //load data in0 = _mm_loadu_si128((__m128i*)&src[(0 + i * 8) * i_src]); in1 = _mm_loadu_si128((__m128i*)&src[(1 + i * 8) * i_src]); in2 = _mm_loadu_si128((__m128i*)&src[(2 + i * 8) * i_src]); in3 = _mm_loadu_si128((__m128i*)&src[(3 + i * 8) * i_src]); in4 = _mm_loadu_si128((__m128i*)&src[(4 + i * 8) * i_src]); in5 = _mm_loadu_si128((__m128i*)&src[(5 + i * 8) * i_src]); in6 = _mm_loadu_si128((__m128i*)&src[(6 + i * 8) * i_src]); in7 = _mm_loadu_si128((__m128i*)&src[(7 + i * 8) * i_src]); //DCT1 #define TRANSPOSE_8x8(I0, I1, I2, I3, I4, I5, I6, I7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ I0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ I1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ I2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ I3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ I4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ I5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ I6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ I7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8(in0, in1, in2, in3, in4, in5, in6, in7) #undef TRANSPOSE_8x8 q0 = _mm_add_epi16(in0, in7); //E0 q1 = _mm_add_epi16(in1, in6); //E1 q2 = _mm_add_epi16(in2, in5); //E2 q3 = _mm_add_epi16(in3, in4); //E3 q4 = _mm_sub_epi16(in0, in7); //O0 q5 = _mm_sub_epi16(in1, in6); //O1 q6 = _mm_sub_epi16(in2, in5); //O2 q7 = _mm_sub_epi16(in3, in4); //O3 //even lines r0 = _mm_add_epi16(q0, q3); //EE0 r1 = _mm_add_epi16(q1, 
q2); //EE1 r2 = _mm_sub_epi16(q0, q3); //EO0 r3 = _mm_sub_epi16(q1, q2); //EO1 t0 = _mm_unpacklo_epi16(r0, r1); //interleave EE0 & EE1 t1 = _mm_unpackhi_epi16(r0, r1); t2 = _mm_unpacklo_epi16(r2, r3); //interleave EO0 & EO1 t3 = _mm_unpackhi_epi16(r2, r3); u0 = _mm_madd_epi16(t0, k_p32_p32); u1 = _mm_madd_epi16(t1, k_p32_p32); u2 = _mm_madd_epi16(t0, k_p32_m32); u3 = _mm_madd_epi16(t1, k_p32_m32); u4 = _mm_madd_epi16(t2, k_p42_p17); u5 = _mm_madd_epi16(t3, k_p42_p17); u6 = _mm_madd_epi16(t2, k_p17_m42); u7 = _mm_madd_epi16(t3, k_p17_m42); v0 = _mm_add_epi32(u0, k_ROUNDING1); v1 = _mm_add_epi32(u1, k_ROUNDING1); v2 = _mm_add_epi32(u2, k_ROUNDING1); v3 = _mm_add_epi32(u3, k_ROUNDING1); v4 = _mm_add_epi32(u4, k_ROUNDING1); v5 = _mm_add_epi32(u5, k_ROUNDING1); v6 = _mm_add_epi32(u6, k_ROUNDING1); v7 = _mm_add_epi32(u7, k_ROUNDING1); w0 = _mm_srai_epi32(v0, shift1); w1 = _mm_srai_epi32(v1, shift1); w2 = _mm_srai_epi32(v2, shift1); w3 = _mm_srai_epi32(v3, shift1); w4 = _mm_srai_epi32(v4, shift1); w5 = _mm_srai_epi32(v5, shift1); w6 = _mm_srai_epi32(v6, shift1); w7 = _mm_srai_epi32(v7, shift1); res0[i] = _mm_packs_epi32(w0, w1); res4[i] = _mm_packs_epi32(w2, w3); res2[i] = _mm_packs_epi32(w4, w5); res6[i] = _mm_packs_epi32(w6, w7); // odd lines t0 = _mm_unpacklo_epi16(q4, q5); //interleave O0 & O1 t1 = _mm_unpackhi_epi16(q4, q5); t2 = _mm_unpacklo_epi16(q6, q7); //interleave O2 & O3 t3 = _mm_unpackhi_epi16(q6, q7); //line 1 u0 = _mm_madd_epi16(t0, k_p44_p38); u1 = _mm_madd_epi16(t1, k_p44_p38); u2 = _mm_madd_epi16(t2, k_p25_p9); u3 = _mm_madd_epi16(t3, k_p25_p9); v0 = _mm_add_epi32(u0, u2); v1 = _mm_add_epi32(u1, u3); v0 = _mm_add_epi32(v0, k_ROUNDING1); v1 = _mm_add_epi32(v1, k_ROUNDING1); w0 = _mm_srai_epi32(v0, shift1); w1 = _mm_srai_epi32(v1, shift1); res1[i] = _mm_packs_epi32(w0, w1); //line 3 u0 = _mm_madd_epi16(t0, k_p38_m9); u1 = _mm_madd_epi16(t1, k_p38_m9); u2 = _mm_madd_epi16(t2, k_m44_m25); u3 = _mm_madd_epi16(t3, k_m44_m25); v0 = _mm_add_epi32(u0, u2); v1 = _mm_add_epi32(u1, u3); v0 = _mm_add_epi32(v0, k_ROUNDING1); v1 = _mm_add_epi32(v1, k_ROUNDING1); w0 = _mm_srai_epi32(v0, shift1); w1 = _mm_srai_epi32(v1, shift1); res3[i] = _mm_packs_epi32(w0, w1); //line 5 u0 = _mm_madd_epi16(t0, k_p25_m44); u1 = _mm_madd_epi16(t1, k_p25_m44); u2 = _mm_madd_epi16(t2, k_p9_p38); u3 = _mm_madd_epi16(t3, k_p9_p38); v0 = _mm_add_epi32(u0, u2); v1 = _mm_add_epi32(u1, u3); v0 = _mm_add_epi32(v0, k_ROUNDING1); v1 = _mm_add_epi32(v1, k_ROUNDING1); w0 = _mm_srai_epi32(v0, shift1); w1 = _mm_srai_epi32(v1, shift1); res5[i] = _mm_packs_epi32(w0, w1); //line 7 u0 = _mm_madd_epi16(t0, k_p9_m25); u1 = _mm_madd_epi16(t1, k_p9_m25); u2 = _mm_madd_epi16(t2, k_p38_m44); u3 = _mm_madd_epi16(t3, k_p38_m44); v0 = _mm_add_epi32(u0, u2); v1 = _mm_add_epi32(u1, u3); v0 = _mm_add_epi32(v0, k_ROUNDING1); v1 = _mm_add_epi32(v1, k_ROUNDING1); w0 = _mm_srai_epi32(v0, shift1); w1 = _mm_srai_epi32(v1, shift1); res7[i] = _mm_packs_epi32(w0, w1); } //DCT2 T00A = res0[0]; // [07 06 05 04 03 02 01 00] T00B = res0[1]; // [15 14 13 12 11 10 09 08] T00C = res0[2]; // [23 22 21 20 19 18 17 16] T00D = res0[3]; // [31 30 29 28 27 26 25 24] T01A = res1[0]; T01B = res1[1]; T01C = res1[2]; T01D = res1[3]; T02A = res2[0]; T02B = res2[1]; T02C = res2[2]; T02D = res2[3]; T03A = res3[0]; T03B = res3[1]; T03C = res3[2]; T03D = res3[3]; TT00A = res4[0]; TT00B = res4[1]; TT00C = res4[2]; TT00D = res4[3]; TT01A = res5[0]; TT01B = res5[1]; TT01C = res5[2]; TT01D = res5[3]; TT02A = res6[0]; TT02B = res6[1]; TT02C = res6[2]; TT02D = res6[3]; TT03A = 
res7[0]; TT03B = res7[1]; TT03C = res7[2]; TT03D = res7[3]; T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); // [16 17 18 19 20 21 22 23] T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); // [24 25 26 27 28 29 30 31] T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT00C = _mm_shuffle_epi8(TT00C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT00D = _mm_shuffle_epi8(TT00D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT01C = _mm_shuffle_epi8(TT01C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT01D = _mm_shuffle_epi8(TT01D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT02C = _mm_shuffle_epi8(TT02C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT02D = _mm_shuffle_epi8(TT02D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT03C = _mm_shuffle_epi8(TT03C, _mm_load_si128((__m128i*)tab_dct_16_0[0])); TT03D = _mm_shuffle_epi8(TT03D, _mm_load_si128((__m128i*)tab_dct_16_0[0])); T10A = _mm_unpacklo_epi16(T00A, T00D); // [28 03 29 02 30 01 31 00] T10B = _mm_unpackhi_epi16(T00A, T00D); // [24 07 25 06 26 05 27 04] T00A = _mm_unpacklo_epi16(T00B, T00C); // [20 11 21 10 22 09 23 08] T00B = _mm_unpackhi_epi16(T00B, T00C); // [16 15 17 14 18 13 19 12] T11A = _mm_unpacklo_epi16(T01A, T01D); T11B = _mm_unpackhi_epi16(T01A, T01D); T01A = _mm_unpacklo_epi16(T01B, T01C); T01B = _mm_unpackhi_epi16(T01B, T01C); T12A = _mm_unpacklo_epi16(T02A, T02D); T12B = _mm_unpackhi_epi16(T02A, T02D); T02A = _mm_unpacklo_epi16(T02B, T02C); T02B = _mm_unpackhi_epi16(T02B, T02C); T13A = _mm_unpacklo_epi16(T03A, T03D); T13B = _mm_unpackhi_epi16(T03A, T03D); T03A = _mm_unpacklo_epi16(T03B, T03C); T03B = _mm_unpackhi_epi16(T03B, T03C); TT10A = _mm_unpacklo_epi16(TT00A, TT00D); TT10B = _mm_unpackhi_epi16(TT00A, TT00D); TT00A = _mm_unpacklo_epi16(TT00B, TT00C); TT00B = _mm_unpackhi_epi16(TT00B, TT00C); TT11A = _mm_unpacklo_epi16(TT01A, TT01D); TT11B = _mm_unpackhi_epi16(TT01A, TT01D); TT01A = _mm_unpacklo_epi16(TT01B, TT01C); TT01B = _mm_unpackhi_epi16(TT01B, TT01C); TT12A = _mm_unpacklo_epi16(TT02A, TT02D); TT12B = _mm_unpackhi_epi16(TT02A, TT02D); TT02A = _mm_unpacklo_epi16(TT02B, TT02C); TT02B = _mm_unpackhi_epi16(TT02B, TT02C); TT13A = _mm_unpacklo_epi16(TT03A, TT03D); TT13B = _mm_unpackhi_epi16(TT03A, TT03D); TT03A = _mm_unpacklo_epi16(TT03B, TT03C); TT03B = _mm_unpackhi_epi16(TT03B, TT03C); #define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \ T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T26 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T31 = _mm_madd_epi16(T12B, 
_mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ \ T60 = _mm_hadd_epi32(T20, T21); \ T61 = _mm_hadd_epi32(T22, T23); \ T62 = _mm_hadd_epi32(T24, T25); \ T63 = _mm_hadd_epi32(T26, T27); \ T64 = _mm_hadd_epi32(T30, T31); \ T65 = _mm_hadd_epi32(T32, T33); \ T66 = _mm_hadd_epi32(T34, T35); \ T67 = _mm_hadd_epi32(T36, T37); \ \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ T62 = _mm_hadd_epi32(T64, T65); \ T63 = _mm_hadd_epi32(T66, T67); \ \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ \ T60 = _mm_hadd_epi32(T60, T61); \ \ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_512), shift2); \ \ TT20 = _mm_madd_epi16(TT10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ TT21 = _mm_madd_epi16(TT10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ TT22 = _mm_madd_epi16(TT00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ TT23 = _mm_madd_epi16(TT00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ TT24 = _mm_madd_epi16(TT11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ TT25 = _mm_madd_epi16(TT11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ TT26 = _mm_madd_epi16(TT01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ TT27 = _mm_madd_epi16(TT01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ TT30 = _mm_madd_epi16(TT12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ TT31 = _mm_madd_epi16(TT12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ TT32 = _mm_madd_epi16(TT02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ TT33 = _mm_madd_epi16(TT02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ TT34 = _mm_madd_epi16(TT13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ TT35 = _mm_madd_epi16(TT13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ TT36 = _mm_madd_epi16(TT03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ TT37 = _mm_madd_epi16(TT03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ \ TT60 = _mm_hadd_epi32(TT20, TT21); \ TT61 = _mm_hadd_epi32(TT22, TT23); \ TT62 = _mm_hadd_epi32(TT24, TT25); \ TT63 = _mm_hadd_epi32(TT26, TT27); \ TT64 = _mm_hadd_epi32(TT30, TT31); \ TT65 = _mm_hadd_epi32(TT32, TT33); \ TT66 = _mm_hadd_epi32(TT34, TT35); \ TT67 = _mm_hadd_epi32(TT36, TT37); \ \ TT60 = _mm_hadd_epi32(TT60, TT61); \ TT61 = _mm_hadd_epi32(TT62, TT63); \ TT62 = _mm_hadd_epi32(TT64, TT65); \ TT63 = _mm_hadd_epi32(TT66, TT67); \ \ TT60 = _mm_hadd_epi32(TT60, TT61); \ TT61 = _mm_hadd_epi32(TT62, TT63); \ \ TT60 = _mm_hadd_epi32(TT60, TT61); \ \ TT60 = _mm_srai_epi32(_mm_add_epi32(TT60, c_512), shift2); \ \ tResult = _mm_packs_epi32(T60, TT60); \ _mm_storeu_si128((__m128i*)&dst[(dstPos)* 8], tResult); \ MAKE_ODD(44, 44, 44, 44, 0); MAKE_ODD(45, 45, 45, 45, 16); MAKE_ODD(46, 47, 46, 47, 8); MAKE_ODD(48, 49, 48, 49, 24); MAKE_ODD(50, 51, 52, 53, 4); MAKE_ODD(54, 55, 56, 57, 12); MAKE_ODD(58, 59, 60, 61, 20); MAKE_ODD(62, 63, 64, 65, 28); MAKE_ODD(66, 67, 68, 69, 2); MAKE_ODD(70, 71, 72, 73, 6); MAKE_ODD(74, 75, 76, 77, 10); MAKE_ODD(78, 79, 80, 81, 14); MAKE_ODD(82, 83, 84, 85, 18); MAKE_ODD(86, 87, 88, 89, 22); MAKE_ODD(90, 91, 92, 93, 26); 
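        /* The remaining MAKE_ODD invocations finish the second transform pass:
         * row 30 and then the odd output rows 1, 3, ..., 31. Each call computes
         * one row of eight coefficients and stores it at dst + dstPos * 8. */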
MAKE_ODD(94, 95, 96, 97, 30); MAKE_ODD(98, 99, 100, 101, 1); MAKE_ODD(102, 103, 104, 105, 3); MAKE_ODD(106, 107, 108, 109, 5); MAKE_ODD(110, 111, 112, 113, 7); MAKE_ODD(114, 115, 116, 117, 9); MAKE_ODD(118, 119, 120, 121, 11); MAKE_ODD(122, 123, 124, 125, 13); MAKE_ODD(126, 127, 128, 129, 15); MAKE_ODD(130, 131, 132, 133, 17); MAKE_ODD(134, 135, 136, 137, 19); MAKE_ODD(138, 139, 140, 141, 21); MAKE_ODD(142, 143, 144, 145, 23); MAKE_ODD(146, 147, 148, 149, 25); MAKE_ODD(150, 151, 152, 153, 27); MAKE_ODD(154, 155, 156, 157, 29); MAKE_ODD(158, 159, 160, 161, 31); #undef MAKE_ODD } /* --------------------------------------------------------------------------- */ void dct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_src) { int i; int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01); const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; const __m128i c_4 = _mm_set1_epi32(ADD1); // TODO: shift1 = 2 const __m128i k_ROUNDING2 = _mm_set1_epi32(ADD2); __m128i r0, r1, r2, r3, t0, q0, q1, q2, q3, q4, q5, q6, q7, v0, v1, w0, w1; __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C; __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D; __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A; __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33, T34, T35, T36, T37; __m128i T40, T41, T42, T43, T44, T45, T46, T47; __m128i T50, T51, T52, T53; __m128i T60; __m128i im[32]; i_src &= 0xFE; T00A = _mm_loadu_si128((__m128i*)&src[0 * i_src + 0]); // [07 06 05 04 03 02 01 00] T00B = _mm_loadu_si128((__m128i*)&src[0 * i_src + 8]); // [15 14 13 12 11 10 09 08] T00C = _mm_loadu_si128((__m128i*)&src[0 * i_src + 16]); // [23 22 21 20 19 18 17 16] T00D = _mm_loadu_si128((__m128i*)&src[0 * i_src + 24]); // [31 30 29 28 27 26 25 24] T01A = _mm_loadu_si128((__m128i*)&src[1 * i_src + 0]); T01B = _mm_loadu_si128((__m128i*)&src[1 * i_src + 8]); T01C = _mm_loadu_si128((__m128i*)&src[1 * i_src + 16]); T01D = _mm_loadu_si128((__m128i*)&src[1 * i_src + 24]); T02A = _mm_loadu_si128((__m128i*)&src[2 * i_src + 0]); T02B = _mm_loadu_si128((__m128i*)&src[2 * i_src + 8]); T02C = _mm_loadu_si128((__m128i*)&src[2 * i_src + 16]); T02D = _mm_loadu_si128((__m128i*)&src[2 * i_src + 24]); T03A = _mm_loadu_si128((__m128i*)&src[3 * i_src + 0]); T03B = _mm_loadu_si128((__m128i*)&src[3 * i_src + 8]); T03C = _mm_loadu_si128((__m128i*)&src[3 * i_src + 16]); T03D = _mm_loadu_si128((__m128i*)&src[3 * i_src + 24]); T04A = _mm_loadu_si128((__m128i*)&src[4 * i_src + 0]); T04B = _mm_loadu_si128((__m128i*)&src[4 * i_src + 8]); T04C = _mm_loadu_si128((__m128i*)&src[4 * i_src + 16]); T04D = _mm_loadu_si128((__m128i*)&src[4 * i_src + 24]); T05A = _mm_loadu_si128((__m128i*)&src[5 * i_src + 0]); T05B = _mm_loadu_si128((__m128i*)&src[5 * i_src + 8]); T05C = _mm_loadu_si128((__m128i*)&src[5 * i_src + 16]); T05D = _mm_loadu_si128((__m128i*)&src[5 * i_src + 24]); T06A = _mm_loadu_si128((__m128i*)&src[6 * i_src + 0]); T06B = _mm_loadu_si128((__m128i*)&src[6 * i_src + 8]); T06C = _mm_loadu_si128((__m128i*)&src[6 * i_src + 16]); T06D = _mm_loadu_si128((__m128i*)&src[6 * i_src + 24]); T07A = _mm_loadu_si128((__m128i*)&src[7 * i_src + 0]); T07B = _mm_loadu_si128((__m128i*)&src[7 * i_src + 8]); T07C = _mm_loadu_si128((__m128i*)&src[7 * i_src + 16]); T07D = 
_mm_loadu_si128((__m128i*)&src[7 * i_src + 24]); // DCT1 T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [26 29 25 30 27 28 24 31] T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T10A = _mm_add_epi16(T00A, T00D); // [E05 E02 E06 E01 E04 E03 E07 E00] T10B = _mm_add_epi16(T00B, T00C); // [E10 E13 E09 E14 E11 E12 E08 E15] T11A = _mm_add_epi16(T01A, T01D); T11B = _mm_add_epi16(T01B, T01C); T12A = _mm_add_epi16(T02A, T02D); T12B = _mm_add_epi16(T02B, T02C); T13A = _mm_add_epi16(T03A, T03D); T13B = _mm_add_epi16(T03B, T03C); T14A = _mm_add_epi16(T04A, T04D); T14B = _mm_add_epi16(T04B, T04C); T15A = _mm_add_epi16(T05A, T05D); T15B = _mm_add_epi16(T05B, T05C); T16A = _mm_add_epi16(T06A, T06D); T16B = _mm_add_epi16(T06B, T06C); T17A = _mm_add_epi16(T07A, T07D); T17B = _mm_add_epi16(T07B, T07C); T00A = _mm_sub_epi16(T00A, T00D); // [O05 O02 O06 O01 O04 O03 O07 O00] T00B = _mm_sub_epi16(T00B, T00C); // [O10 O13 O09 O14 O11 O12 O08 O15] T01A = _mm_sub_epi16(T01A, T01D); T01B = _mm_sub_epi16(T01B, T01C); T02A = _mm_sub_epi16(T02A, T02D); T02B = _mm_sub_epi16(T02B, T02C); T03A = _mm_sub_epi16(T03A, T03D); T03B = _mm_sub_epi16(T03B, T03C); T04A = _mm_sub_epi16(T04A, T04D); T04B = 
_mm_sub_epi16(T04B, T04C); T05A = _mm_sub_epi16(T05A, T05D); T05B = _mm_sub_epi16(T05B, T05C); T06A = _mm_sub_epi16(T06A, T06D); T06B = _mm_sub_epi16(T06B, T06C); T07A = _mm_sub_epi16(T07A, T07D); T07B = _mm_sub_epi16(T07B, T07C); T20 = _mm_add_epi16(T10A, T10B); // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0] T21 = _mm_add_epi16(T11A, T11B); T22 = _mm_add_epi16(T12A, T12B); T23 = _mm_add_epi16(T13A, T13B); T24 = _mm_add_epi16(T14A, T14B); T25 = _mm_add_epi16(T15A, T15B); T26 = _mm_add_epi16(T16A, T16B); T27 = _mm_add_epi16(T17A, T17B); T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1])); T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1])); T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1])); T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1])); T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1])); T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1])); T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1])); T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1])); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), shift1); T60 = _mm_packs_epi32(T50, T51); im[0] = T60; //_mm_storeu_si128((__m128i*)&dst[0 * 8], T60); T50 = _mm_hsub_epi32(T40, T41); T51 = _mm_hsub_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), shift1); T60 = _mm_packs_epi32(T50, T51); im[16] = T60; //_mm_storeu_si128((__m128i*)&dst[16 * 8], T60); T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), shift1); T60 = _mm_packs_epi32(T50, T51); im[8] = T60; //_mm_storeu_si128((__m128i*)&dst[8 * 8], T60); T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9])); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), shift1); 
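        /* pack the rounded 32-bit sums back to 16-bit coefficients and keep them
           in entry im[24] of the intermediate buffer for the second pass */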
T60 = _mm_packs_epi32(T50, T51); im[24] = T60; //_mm_storeu_si128((__m128i*)&dst[24 * 8], T60); #define MAKE_ODD(tab, dstPos) \ T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ \ T40 = _mm_hadd_epi32(T30, T31); \ T41 = _mm_hadd_epi32(T32, T33); \ T42 = _mm_hadd_epi32(T34, T35); \ T43 = _mm_hadd_epi32(T36, T37); \ \ T50 = _mm_hadd_epi32(T40, T41); \ T51 = _mm_hadd_epi32(T42, T43); \ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); \ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), shift1); \ T60 = _mm_packs_epi32(T50, T51); \ im[(dstPos)] = T60; //_mm_storeu_si128((__m128i*)&dst[dstPos * 8], T60); MAKE_ODD(0, 4); MAKE_ODD(1, 12); MAKE_ODD(2, 20); MAKE_ODD(3, 28); T20 = _mm_sub_epi16(T10A, T10B); // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0] T21 = _mm_sub_epi16(T11A, T11B); T22 = _mm_sub_epi16(T12A, T12B); T23 = _mm_sub_epi16(T13A, T13B); T24 = _mm_sub_epi16(T14A, T14B); T25 = _mm_sub_epi16(T15A, T15B); T26 = _mm_sub_epi16(T16A, T16B); T27 = _mm_sub_epi16(T17A, T17B); MAKE_ODD(4, 2); MAKE_ODD(5, 6); MAKE_ODD(6, 10); MAKE_ODD(7, 14); MAKE_ODD(8, 18); MAKE_ODD(9, 22); MAKE_ODD(10, 26); MAKE_ODD(11, 30); #undef MAKE_ODD #define MAKE_ODD(tab, dstPos) \ T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)+1])); \ \ T40 = _mm_hadd_epi32(T20, T21); \ T41 = _mm_hadd_epi32(T22, T23); \ T42 = _mm_hadd_epi32(T24, T25); \ T43 = _mm_hadd_epi32(T26, T27); \ T44 = _mm_hadd_epi32(T30, T31); \ T45 = _mm_hadd_epi32(T32, T33); \ T46 = _mm_hadd_epi32(T34, T35); \ T47 = _mm_hadd_epi32(T36, T37); \ \ T50 = _mm_hadd_epi32(T40, T41); \ T51 = _mm_hadd_epi32(T42, T43); \ T52 = _mm_hadd_epi32(T44, T45); \ T53 = _mm_hadd_epi32(T46, T47); \ \ T50 = _mm_hadd_epi32(T50, T51); \ T51 = _mm_hadd_epi32(T52, T53); \ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_4), shift1); \ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_4), 
shift1); \ T60 = _mm_packs_epi32(T50, T51); \ im[(dstPos)] = T60; //_mm_storeu_si128((__m128i*)&dst[dstPos * 8], T60); MAKE_ODD(12, 1); MAKE_ODD(14, 3); MAKE_ODD(16, 5); MAKE_ODD(18, 7); MAKE_ODD(20, 9); MAKE_ODD(22, 11); MAKE_ODD(24, 13); MAKE_ODD(26, 15); MAKE_ODD(28, 17); MAKE_ODD(30, 19); MAKE_ODD(32, 21); MAKE_ODD(34, 23); MAKE_ODD(36, 25); MAKE_ODD(38, 27); MAKE_ODD(40, 29); MAKE_ODD(42, 31); #undef MAKE_ODD //DCT2 for (i = 0; i < 32 / 8; i++){ /*in0 = _mm_loadu_si128((const __m128i *)(src + (0 + i * 8) * 8)); in1 = _mm_loadu_si128((const __m128i *)(src + (1 + i * 8) * 8)); in2 = _mm_loadu_si128((const __m128i *)(src + (2 + i * 8) * 8)); in3 = _mm_loadu_si128((const __m128i *)(src + (3 + i * 8) * 8)); in4 = _mm_loadu_si128((const __m128i *)(src + (4 + i * 8) * 8)); in5 = _mm_loadu_si128((const __m128i *)(src + (5 + i * 8) * 8)); in6 = _mm_loadu_si128((const __m128i *)(src + (6 + i * 8) * 8)); in7 = _mm_loadu_si128((const __m128i *)(src + (7 + i * 8) * 8));*/ #define MAKE_ODD(tab)\ q0 = _mm_madd_epi16(im[8 * i + 0], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q1 = _mm_madd_epi16(im[8 * i + 1], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q2 = _mm_madd_epi16(im[8 * i + 2], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q3 = _mm_madd_epi16(im[8 * i + 3], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q4 = _mm_madd_epi16(im[8 * i + 4], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q5 = _mm_madd_epi16(im[8 * i + 5], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q6 = _mm_madd_epi16(im[8 * i + 6], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ q7 = _mm_madd_epi16(im[8 * i + 7], _mm_load_si128((__m128i*)tab_dct_8_1[tab])); \ r0 = _mm_hadd_epi32(q0, q1); \ r1 = _mm_hadd_epi32(q2, q3); \ r2 = _mm_hadd_epi32(q4, q5); \ r3 = _mm_hadd_epi32(q6, q7); \ v0 = _mm_hadd_epi32(r0, r1); \ v1 = _mm_hadd_epi32(r2, r3); \ v0 = _mm_add_epi32(v0, k_ROUNDING2); \ v1 = _mm_add_epi32(v1, k_ROUNDING2); \ w0 = _mm_srai_epi32(v0, shift2); \ w1 = _mm_srai_epi32(v1, shift2); \ t0 = _mm_packs_epi32(w0, w1); \ _mm_storeu_si128((__m128i *)(dst + 32 * tab + i * 8), t0); MAKE_ODD(0); MAKE_ODD(1); MAKE_ODD(2); MAKE_ODD(3); MAKE_ODD(4); MAKE_ODD(5); MAKE_ODD(6); MAKE_ODD(7); #undef MAKE_ODD } } /* --------------------------------------------------------------------------- */ void dct_c_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; // Const __m128i c_add1 = _mm_set1_epi32(ADD1); __m128i c_add2 = _mm_set1_epi32(ADD2); __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C; __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D; __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A; __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33, T34, T35, T36, T37; __m128i T40, T41, T42, T43, T44, T45, T46, T47; __m128i T50, T51, T52, T53; __m128i T60, T61, T62, T63, T64, T65, T66, T67; __m128i im[16][4]; int i; i_src &= 0xFE; /* remember to remove the flag bit */ // DCT1 for (i = 0; i < 32 / 8; i++) { T00A = _mm_load_si128((__m128i*)(src + (i * 8 + 0) * i_src + 0)); // [07 06 05 04 03 02 01 00] T00B = _mm_load_si128((__m128i*)(src + (i * 8 + 0) * i_src + 8)); // [15 14 13 12 11 10 09 08] T00C = 
_mm_load_si128((__m128i*)(src + (i * 8 + 0) * i_src + 16)); // [23 22 21 20 19 18 17 16] T00D = _mm_load_si128((__m128i*)(src + (i * 8 + 0) * i_src + 24)); // [31 30 29 28 27 26 25 24] T01A = _mm_load_si128((__m128i*)(src + (i * 8 + 1) * i_src + 0)); T01B = _mm_load_si128((__m128i*)(src + (i * 8 + 1) * i_src + 8)); T01C = _mm_load_si128((__m128i*)(src + (i * 8 + 1) * i_src + 16)); T01D = _mm_load_si128((__m128i*)(src + (i * 8 + 1) * i_src + 24)); T02A = _mm_load_si128((__m128i*)(src + (i * 8 + 2) * i_src + 0)); T02B = _mm_load_si128((__m128i*)(src + (i * 8 + 2) * i_src + 8)); T02C = _mm_load_si128((__m128i*)(src + (i * 8 + 2) * i_src + 16)); T02D = _mm_load_si128((__m128i*)(src + (i * 8 + 2) * i_src + 24)); T03A = _mm_load_si128((__m128i*)(src + (i * 8 + 3) * i_src + 0)); T03B = _mm_load_si128((__m128i*)(src + (i * 8 + 3) * i_src + 8)); T03C = _mm_load_si128((__m128i*)(src + (i * 8 + 3) * i_src + 16)); T03D = _mm_load_si128((__m128i*)(src + (i * 8 + 3) * i_src + 24)); T04A = _mm_load_si128((__m128i*)(src + (i * 8 + 4) * i_src + 0)); T04B = _mm_load_si128((__m128i*)(src + (i * 8 + 4) * i_src + 8)); T04C = _mm_load_si128((__m128i*)(src + (i * 8 + 4) * i_src + 16)); T04D = _mm_load_si128((__m128i*)(src + (i * 8 + 4) * i_src + 24)); T05A = _mm_load_si128((__m128i*)(src + (i * 8 + 5) * i_src + 0)); T05B = _mm_load_si128((__m128i*)(src + (i * 8 + 5) * i_src + 8)); T05C = _mm_load_si128((__m128i*)(src + (i * 8 + 5) * i_src + 16)); T05D = _mm_load_si128((__m128i*)(src + (i * 8 + 5) * i_src + 24)); T06A = _mm_load_si128((__m128i*)(src + (i * 8 + 6) * i_src + 0)); T06B = _mm_load_si128((__m128i*)(src + (i * 8 + 6) * i_src + 8)); T06C = _mm_load_si128((__m128i*)(src + (i * 8 + 6) * i_src + 16)); T06D = _mm_load_si128((__m128i*)(src + (i * 8 + 6) * i_src + 24)); T07A = _mm_load_si128((__m128i*)(src + (i * 8 + 7) * i_src + 0)); T07B = _mm_load_si128((__m128i*)(src + (i * 8 + 7) * i_src + 8)); T07C = _mm_load_si128((__m128i*)(src + (i * 8 + 7) * i_src + 16)); T07D = _mm_load_si128((__m128i*)(src + (i * 8 + 7) * i_src + 24)); T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [26 29 25 30 27 28 24 31] T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04C = _mm_shuffle_epi8(T04C, 
_mm_load_si128((__m128i*)tab_dct_16_0[1])); T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T10A = _mm_add_epi16(T00A, T00D); // [E05 E02 E06 E01 E04 E03 E07 E00] T10B = _mm_add_epi16(T00B, T00C); // [E10 E13 E09 E14 E11 E12 E08 E15] T11A = _mm_add_epi16(T01A, T01D); T11B = _mm_add_epi16(T01B, T01C); T12A = _mm_add_epi16(T02A, T02D); T12B = _mm_add_epi16(T02B, T02C); T13A = _mm_add_epi16(T03A, T03D); T13B = _mm_add_epi16(T03B, T03C); T14A = _mm_add_epi16(T04A, T04D); T14B = _mm_add_epi16(T04B, T04C); T15A = _mm_add_epi16(T05A, T05D); T15B = _mm_add_epi16(T05B, T05C); T16A = _mm_add_epi16(T06A, T06D); T16B = _mm_add_epi16(T06B, T06C); T17A = _mm_add_epi16(T07A, T07D); T17B = _mm_add_epi16(T07B, T07C); T00A = _mm_sub_epi16(T00A, T00D); // [O05 O02 O06 O01 O04 O03 O07 O00] T00B = _mm_sub_epi16(T00B, T00C); // [O10 O13 O09 O14 O11 O12 O08 O15] T01A = _mm_sub_epi16(T01A, T01D); T01B = _mm_sub_epi16(T01B, T01C); T02A = _mm_sub_epi16(T02A, T02D); T02B = _mm_sub_epi16(T02B, T02C); T03A = _mm_sub_epi16(T03A, T03D); T03B = _mm_sub_epi16(T03B, T03C); T04A = _mm_sub_epi16(T04A, T04D); T04B = _mm_sub_epi16(T04B, T04C); T05A = _mm_sub_epi16(T05A, T05D); T05B = _mm_sub_epi16(T05B, T05C); T06A = _mm_sub_epi16(T06A, T06D); T06B = _mm_sub_epi16(T06B, T06C); T07A = _mm_sub_epi16(T07A, T07D); T07B = _mm_sub_epi16(T07B, T07C); T20 = _mm_add_epi16(T10A, T10B); // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0] T21 = _mm_add_epi16(T11A, T11B); T22 = _mm_add_epi16(T12A, T12B); T23 = _mm_add_epi16(T13A, T13B); T24 = _mm_add_epi16(T14A, T14B); T25 = _mm_add_epi16(T15A, T15B); T26 = _mm_add_epi16(T16A, T16B); T27 = _mm_add_epi16(T17A, T17B); T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1]));// [ 05+02 06+01 04+03 07+00] T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1])); T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1])); T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1])); T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1])); T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1])); T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1])); T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1])); T40 = _mm_hadd_epi32(T30, T31);//[05+02+06+01 04+03+07+00] T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); T60 = _mm_packs_epi32(T50, T51); im[0][i] = T60; T30 = _mm_madd_epi16(T20, 
_mm_load_si128((__m128i*)tab_dct_16_1[8])); T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8])); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); T60 = _mm_packs_epi32(T50, T51); im[8][i] = T60; #define MAKE_ODD(tab, dstPos) \ T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T33 = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T34 = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T35 = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T36 = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ T37 = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \ \ T40 = _mm_hadd_epi32(T30, T31); \ T41 = _mm_hadd_epi32(T32, T33); \ T42 = _mm_hadd_epi32(T34, T35); \ T43 = _mm_hadd_epi32(T36, T37); \ \ T50 = _mm_hadd_epi32(T40, T41); \ T51 = _mm_hadd_epi32(T42, T43); \ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); \ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); \ T60 = _mm_packs_epi32(T50, T51); \ im[(dstPos)][i] = T60; MAKE_ODD(0, 4); MAKE_ODD(1, 12); T20 = _mm_sub_epi16(T10A, T10B); // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0] T21 = _mm_sub_epi16(T11A, T11B); T22 = _mm_sub_epi16(T12A, T12B); T23 = _mm_sub_epi16(T13A, T13B); T24 = _mm_sub_epi16(T14A, T14B); T25 = _mm_sub_epi16(T15A, T15B); T26 = _mm_sub_epi16(T16A, T16B); T27 = _mm_sub_epi16(T17A, T17B); MAKE_ODD( 4, 2); MAKE_ODD( 5, 6); MAKE_ODD( 6, 10); MAKE_ODD( 7, 14); #undef MAKE_ODD #define MAKE_ODD(tab, dstPos) \ T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T23 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T24 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T25 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T26 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T27 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T30 = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T31 = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T32 = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T33 = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T34 = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ])); \ T35 = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \ T36 = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) 
])); \
    T37 = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
    \
    T40 = _mm_hadd_epi32(T20, T21); \
    T41 = _mm_hadd_epi32(T22, T23); \
    T42 = _mm_hadd_epi32(T24, T25); \
    T43 = _mm_hadd_epi32(T26, T27); \
    T44 = _mm_hadd_epi32(T30, T31); \
    T45 = _mm_hadd_epi32(T32, T33); \
    T46 = _mm_hadd_epi32(T34, T35); \
    T47 = _mm_hadd_epi32(T36, T37); \
    \
    T50 = _mm_hadd_epi32(T40, T41); \
    T51 = _mm_hadd_epi32(T42, T43); \
    T52 = _mm_hadd_epi32(T44, T45); \
    T53 = _mm_hadd_epi32(T46, T47); \
    \
    T50 = _mm_hadd_epi32(T50, T51); \
    T51 = _mm_hadd_epi32(T52, T53); \
    T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); \
    T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); \
    T60 = _mm_packs_epi32(T50, T51); \
    im[(dstPos)][i] = T60;

        MAKE_ODD(12, 1);
        MAKE_ODD(14, 3);
        MAKE_ODD(16, 5);
        MAKE_ODD(18, 7);
        MAKE_ODD(20, 9);
        MAKE_ODD(22, 11);
        MAKE_ODD(24, 13);
        MAKE_ODD(26, 15);
#undef MAKE_ODD
    }

    /* clear result buffer */
    xavs2_fast_memzero_mmx(dst, 32 * 32 * sizeof(coeff_t));

    // DCT2: only the first 16 rows and 16 columns are computed (half transform)
    for (i = 0; i < 16 / 4; i++) {
        // OPT_ME: matrix multiplication is used here to avoid register spills; is there a better way?
        T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
        T00B = im[i * 4 + 0][1];    // [15 14 13 12 11 10 09 08]
        T00C = im[i * 4 + 0][2];    // [23 22 21 20 19 18 17 16]
        T00D = im[i * 4 + 0][3];    // [31 30 29 28 27 26 25 24]
        T01A = im[i * 4 + 1][0];
        T01B = im[i * 4 + 1][1];
        T01C = im[i * 4 + 1][2];
        T01D = im[i * 4 + 1][3];
        T02A = im[i * 4 + 2][0];
        T02B = im[i * 4 + 2][1];
        T02C = im[i * 4 + 2][2];
        T02D = im[i * 4 + 2][3];
        T03A = im[i * 4 + 3][0];
        T03B = im[i * 4 + 3][1];
        T03C = im[i * 4 + 3][2];
        T03D = im[i * 4 + 3][3];

        T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));  // [16 17 18 19 20 21 22 23]
        T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));  // [24 25 26 27 28 29 30 31]
        T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
        T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));

        T10A = _mm_unpacklo_epi16(T00A, T00D);  // [28 03 29 02 30 01 31 00]
        T10B = _mm_unpackhi_epi16(T00A, T00D);  // [24 07 25 06 26 05 27 04]
        T00A = _mm_unpacklo_epi16(T00B, T00C);  // [20 11 21 10 22 09 23 08]
        T00B = _mm_unpackhi_epi16(T00B, T00C);  // [16 15 17 14 18 13 19 12]
        T11A = _mm_unpacklo_epi16(T01A, T01D);
        T11B = _mm_unpackhi_epi16(T01A, T01D);
        T01A = _mm_unpacklo_epi16(T01B, T01C);
        T01B = _mm_unpackhi_epi16(T01B, T01C);
        T12A = _mm_unpacklo_epi16(T02A, T02D);
        T12B = _mm_unpackhi_epi16(T02A, T02D);
        T02A = _mm_unpacklo_epi16(T02B, T02C);
        T02B = _mm_unpackhi_epi16(T02B, T02C);
        T13A = _mm_unpacklo_epi16(T03A, T03D);
        T13B = _mm_unpackhi_epi16(T03A, T03D);
        T03A = _mm_unpacklo_epi16(T03B, T03C);
        T03B = _mm_unpackhi_epi16(T03B, T03C);

#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
    T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
    T23 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
    T24 = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
    T25 = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
    T26 =
_mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T27 = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ T30 = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T31 = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T32 = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T33 = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ T34 = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \ T35 = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \ T36 = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \ T37 = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \ \ T60 = _mm_hadd_epi32(T20, T21); \ T61 = _mm_hadd_epi32(T22, T23); \ T62 = _mm_hadd_epi32(T24, T25); \ T63 = _mm_hadd_epi32(T26, T27); \ T64 = _mm_hadd_epi32(T30, T31); \ T65 = _mm_hadd_epi32(T32, T33); \ T66 = _mm_hadd_epi32(T34, T35); \ T67 = _mm_hadd_epi32(T36, T37); \ \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ T62 = _mm_hadd_epi32(T64, T65); \ T63 = _mm_hadd_epi32(T66, T67); \ \ T60 = _mm_hadd_epi32(T60, T61); \ T61 = _mm_hadd_epi32(T62, T63); \ \ T60 = _mm_hadd_epi32(T60, T61); \ \ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_add2), SHIFT2); \ T60 = _mm_packs_epi32(T60, T60); \ _mm_storel_epi64((__m128i*)(dst + (dstPos) * 32 + (i * 4) + 0), T60); MAKE_ODD(44, 44, 44, 44, 0); MAKE_ODD(46, 47, 46, 47, 8); MAKE_ODD(50, 51, 52, 53, 4); MAKE_ODD(54, 55, 56, 57, 12); MAKE_ODD(66, 67, 68, 69, 2); MAKE_ODD(70, 71, 72, 73, 6); MAKE_ODD(74, 75, 76, 77, 10); MAKE_ODD(78, 79, 80, 81, 14); MAKE_ODD( 98, 99, 100, 101, 1); MAKE_ODD(102, 103, 104, 105, 3); MAKE_ODD(106, 107, 108, 109, 5); MAKE_ODD(110, 111, 112, 113, 7); MAKE_ODD(114, 115, 116, 117, 9); MAKE_ODD(118, 119, 120, 121, 11); MAKE_ODD(122, 123, 124, 125, 13); MAKE_ODD(126, 127, 128, 129, 15); #undef MAKE_ODD } } //optimize 32x32 size transform void dct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_src) { const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << SHIFT2) >> 1; // Const __m128i c_add1 = _mm_set1_epi32(ADD1); __m128i c_add2 = _mm_set1_epi32(ADD2); __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A; __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B; __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C; __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D; __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A; __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B; __m128i T20, T21, T22, T23, T24, T25, T26, T27; __m128i T30, T31, T32, T33, T34, T35, T36, T37; __m128i T40, T41, T42, T43, T44, T45, T46, T47; __m128i T50, T51, T52, T53; __m128i T60; __m128i im[32][4]; int i; i_src &= 0xFE; /* remember to remove the flag bit */ // DCT1 for (i = 0; i < 32 / 8; i++) { T00A = _mm_load_si128((__m128i*)(src + 0)); // [07 06 05 04 03 02 01 00] T00B = _mm_load_si128((__m128i*)(src + 8)); // [15 14 13 12 11 10 09 08] T00C = _mm_load_si128((__m128i*)(src + 16)); // [23 22 21 20 19 18 17 16] T00D = _mm_load_si128((__m128i*)(src + 24)); // [31 30 29 28 27 26 25 24] src += i_src; T01A = _mm_load_si128((__m128i*)(src + 0)); T01B = _mm_load_si128((__m128i*)(src + 8)); T01C = _mm_load_si128((__m128i*)(src + 16)); T01D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T02A = 
_mm_load_si128((__m128i*)(src + 0)); T02B = _mm_load_si128((__m128i*)(src + 8)); T02C = _mm_load_si128((__m128i*)(src + 16)); T02D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T03A = _mm_load_si128((__m128i*)(src + 0)); T03B = _mm_load_si128((__m128i*)(src + 8)); T03C = _mm_load_si128((__m128i*)(src + 16)); T03D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T04A = _mm_load_si128((__m128i*)(src + 0)); T04B = _mm_load_si128((__m128i*)(src + 8)); T04C = _mm_load_si128((__m128i*)(src + 16)); T04D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T05A = _mm_load_si128((__m128i*)(src + 0)); T05B = _mm_load_si128((__m128i*)(src + 8)); T05C = _mm_load_si128((__m128i*)(src + 16)); T05D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T06A = _mm_load_si128((__m128i*)(src + 0)); T06B = _mm_load_si128((__m128i*)(src + 8)); T06C = _mm_load_si128((__m128i*)(src + 16)); T06D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; T07A = _mm_load_si128((__m128i*)(src + 0)); T07B = _mm_load_si128((__m128i*)(src + 8)); T07C = _mm_load_si128((__m128i*)(src + 16)); T07D = _mm_load_si128((__m128i*)(src + 24)); src += i_src; //_mm_load_si128((__m128i)tab_dct_16_0[1]) *((__m128i*)tab_dct_16_0[1]) T00A = _mm_shuffle_epi8(T00A, *((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] T00B = _mm_shuffle_epi8(T00B, *((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] T00C = _mm_shuffle_epi8(T00C, *((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] T00D = _mm_shuffle_epi8(T00D, *((__m128i*)tab_dct_32_0[0])); // [26 29 25 30 27 28 24 31] T01A = _mm_shuffle_epi8(T01A, *((__m128i*)tab_dct_16_0[1])); T01B = _mm_shuffle_epi8(T01B, *((__m128i*)tab_dct_32_0[0])); T01C = _mm_shuffle_epi8(T01C, *((__m128i*)tab_dct_16_0[1])); T01D = _mm_shuffle_epi8(T01D, *((__m128i*)tab_dct_32_0[0])); T02A = _mm_shuffle_epi8(T02A, *((__m128i*)tab_dct_16_0[1])); T02B = _mm_shuffle_epi8(T02B, *((__m128i*)tab_dct_32_0[0])); T02C = _mm_shuffle_epi8(T02C, *((__m128i*)tab_dct_16_0[1])); T02D = _mm_shuffle_epi8(T02D, *((__m128i*)tab_dct_32_0[0])); T03A = _mm_shuffle_epi8(T03A, *((__m128i*)tab_dct_16_0[1])); T03B = _mm_shuffle_epi8(T03B, *((__m128i*)tab_dct_32_0[0])); T03C = _mm_shuffle_epi8(T03C, *((__m128i*)tab_dct_16_0[1])); T03D = _mm_shuffle_epi8(T03D, *((__m128i*)tab_dct_32_0[0])); T04A = _mm_shuffle_epi8(T04A, *((__m128i*)tab_dct_16_0[1])); T04B = _mm_shuffle_epi8(T04B, *((__m128i*)tab_dct_32_0[0])); T04C = _mm_shuffle_epi8(T04C, *((__m128i*)tab_dct_16_0[1])); T04D = _mm_shuffle_epi8(T04D, *((__m128i*)tab_dct_32_0[0])); T05A = _mm_shuffle_epi8(T05A, *((__m128i*)tab_dct_16_0[1])); T05B = _mm_shuffle_epi8(T05B, *((__m128i*)tab_dct_32_0[0])); T05C = _mm_shuffle_epi8(T05C, *((__m128i*)tab_dct_16_0[1])); T05D = _mm_shuffle_epi8(T05D, *((__m128i*)tab_dct_32_0[0])); T06A = _mm_shuffle_epi8(T06A, *((__m128i*)tab_dct_16_0[1])); T06B = _mm_shuffle_epi8(T06B, *((__m128i*)tab_dct_32_0[0])); T06C = _mm_shuffle_epi8(T06C, *((__m128i*)tab_dct_16_0[1])); T06D = _mm_shuffle_epi8(T06D, *((__m128i*)tab_dct_32_0[0])); T07A = _mm_shuffle_epi8(T07A, *((__m128i*)tab_dct_16_0[1])); T07B = _mm_shuffle_epi8(T07B, *((__m128i*)tab_dct_32_0[0])); T07C = _mm_shuffle_epi8(T07C, *((__m128i*)tab_dct_16_0[1])); T07D = _mm_shuffle_epi8(T07D, *((__m128i*)tab_dct_32_0[0])); T10A = _mm_add_epi16(T00A, T00D); // [E05 E02 E06 E01 E04 E03 E07 E00] T10B = _mm_add_epi16(T00B, T00C); // [E10 E13 E09 E14 E11 E12 E08 E15] T11A = _mm_add_epi16(T01A, T01D); T11B = _mm_add_epi16(T01B, T01C); T12A = _mm_add_epi16(T02A, T02D); T12B = 
_mm_add_epi16(T02B, T02C);
        T13A = _mm_add_epi16(T03A, T03D);
        T13B = _mm_add_epi16(T03B, T03C);
        T14A = _mm_add_epi16(T04A, T04D);
        T14B = _mm_add_epi16(T04B, T04C);
        T15A = _mm_add_epi16(T05A, T05D);
        T15B = _mm_add_epi16(T05B, T05C);
        T16A = _mm_add_epi16(T06A, T06D);
        T16B = _mm_add_epi16(T06B, T06C);
        T17A = _mm_add_epi16(T07A, T07D);
        T17B = _mm_add_epi16(T07B, T07C);

        T00A = _mm_sub_epi16(T00A, T00D);  // [O05 O02 O06 O01 O04 O03 O07 O00]
        T00B = _mm_sub_epi16(T00B, T00C);  // [O10 O13 O09 O14 O11 O12 O08 O15]
        T01A = _mm_sub_epi16(T01A, T01D);
        T01B = _mm_sub_epi16(T01B, T01C);
        T02A = _mm_sub_epi16(T02A, T02D);
        T02B = _mm_sub_epi16(T02B, T02C);
        T03A = _mm_sub_epi16(T03A, T03D);
        T03B = _mm_sub_epi16(T03B, T03C);
        T04A = _mm_sub_epi16(T04A, T04D);
        T04B = _mm_sub_epi16(T04B, T04C);
        T05A = _mm_sub_epi16(T05A, T05D);
        T05B = _mm_sub_epi16(T05B, T05C);
        T06A = _mm_sub_epi16(T06A, T06D);
        T06B = _mm_sub_epi16(T06B, T06C);
        T07A = _mm_sub_epi16(T07A, T07D);
        T07B = _mm_sub_epi16(T07B, T07C);

        T20 = _mm_add_epi16(T10A, T10B);  // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
        T21 = _mm_add_epi16(T11A, T11B);
        T22 = _mm_add_epi16(T12A, T12B);
        T23 = _mm_add_epi16(T13A, T13B);
        T24 = _mm_add_epi16(T14A, T14B);
        T25 = _mm_add_epi16(T15A, T15B);
        T26 = _mm_add_epi16(T16A, T16B);
        T27 = _mm_add_epi16(T17A, T17B);

        //_mm_load_si128((__m128i*)tab_dct_8[1]) ->*((__m128i*)tab_dct_8[1])
        T30 = _mm_madd_epi16(T20, *((__m128i*)tab_dct_8[1]));
        T31 = _mm_madd_epi16(T21, *((__m128i*)tab_dct_8[1]));
        T32 = _mm_madd_epi16(T22, *((__m128i*)tab_dct_8[1]));
        T33 = _mm_madd_epi16(T23, *((__m128i*)tab_dct_8[1]));
        T34 = _mm_madd_epi16(T24, *((__m128i*)tab_dct_8[1]));
        T35 = _mm_madd_epi16(T25, *((__m128i*)tab_dct_8[1]));
        T36 = _mm_madd_epi16(T26, *((__m128i*)tab_dct_8[1]));
        T37 = _mm_madd_epi16(T27, *((__m128i*)tab_dct_8[1]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1);
        T60 = _mm_packs_epi32(T50, T51);
        im[0][i] = T60;    // intermediate coefficients of the first pass (16 bits each)

        T50 = _mm_hsub_epi32(T40, T41);
        T51 = _mm_hsub_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1);
        T60 = _mm_packs_epi32(T50, T51);
        im[16][i] = T60;

        //_mm_load_si128((__m128i*)tab_dct_16_1[8]) ->
        T30 = _mm_madd_epi16(T20, *((__m128i*)tab_dct_16_1[8]));
        T31 = _mm_madd_epi16(T21, *((__m128i*)tab_dct_16_1[8]));
        T32 = _mm_madd_epi16(T22, *((__m128i*)tab_dct_16_1[8]));
        T33 = _mm_madd_epi16(T23, *((__m128i*)tab_dct_16_1[8]));
        T34 = _mm_madd_epi16(T24, *((__m128i*)tab_dct_16_1[8]));
        T35 = _mm_madd_epi16(T25, *((__m128i*)tab_dct_16_1[8]));
        T36 = _mm_madd_epi16(T26, *((__m128i*)tab_dct_16_1[8]));
        T37 = _mm_madd_epi16(T27, *((__m128i*)tab_dct_16_1[8]));

        T40 = _mm_hadd_epi32(T30, T31);
        T41 = _mm_hadd_epi32(T32, T33);
        T42 = _mm_hadd_epi32(T34, T35);
        T43 = _mm_hadd_epi32(T36, T37);

        T50 = _mm_hadd_epi32(T40, T41);
        T51 = _mm_hadd_epi32(T42, T43);
        T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1);
        T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1);
        T60 = _mm_packs_epi32(T50, T51);
        im[8][i] = T60;

        //
        T30 = _mm_madd_epi16(T20, *((__m128i*)tab_dct_16_1[9]));
        T31 = _mm_madd_epi16(T21, *((__m128i*)tab_dct_16_1[9]));
        T32 = _mm_madd_epi16(T22, *((__m128i*)tab_dct_16_1[9]));
        T33 = _mm_madd_epi16(T23, *((__m128i*)tab_dct_16_1[9]));
        T34 = _mm_madd_epi16(T24, *((__m128i*)tab_dct_16_1[9]));
T35 = _mm_madd_epi16(T25, *((__m128i*)tab_dct_16_1[9])); T36 = _mm_madd_epi16(T26, *((__m128i*)tab_dct_16_1[9])); T37 = _mm_madd_epi16(T27, *((__m128i*)tab_dct_16_1[9])); T40 = _mm_hadd_epi32(T30, T31); T41 = _mm_hadd_epi32(T32, T33); T42 = _mm_hadd_epi32(T34, T35); T43 = _mm_hadd_epi32(T36, T37); T50 = _mm_hadd_epi32(T40, T41); T51 = _mm_hadd_epi32(T42, T43); T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); T60 = _mm_packs_epi32(T50, T51); im[24][i] = T60; #define MAKE_ODD(tab, dstPos) \ T30 = _mm_madd_epi16(T20, *((__m128i*)tab_dct_32_1[(tab)])); \ T31 = _mm_madd_epi16(T21, *((__m128i*)tab_dct_32_1[(tab)])); \ T32 = _mm_madd_epi16(T22, *((__m128i*)tab_dct_32_1[(tab)])); \ T33 = _mm_madd_epi16(T23, *((__m128i*)tab_dct_32_1[(tab)])); \ T34 = _mm_madd_epi16(T24, *((__m128i*)tab_dct_32_1[(tab)])); \ T35 = _mm_madd_epi16(T25, *((__m128i*)tab_dct_32_1[(tab)])); \ T36 = _mm_madd_epi16(T26, *((__m128i*)tab_dct_32_1[(tab)])); \ T37 = _mm_madd_epi16(T27, *((__m128i*)tab_dct_32_1[(tab)])); \ \ T40 = _mm_hadd_epi32(T30, T31); \ T41 = _mm_hadd_epi32(T32, T33); \ T42 = _mm_hadd_epi32(T34, T35); \ T43 = _mm_hadd_epi32(T36, T37); \ \ T50 = _mm_hadd_epi32(T40, T41); \ T51 = _mm_hadd_epi32(T42, T43); \ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); \ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); \ T60 = _mm_packs_epi32(T50, T51); \ im[(dstPos)][i] = T60; MAKE_ODD(0, 4); MAKE_ODD(1, 12); MAKE_ODD(2, 20); MAKE_ODD(3, 28); T20 = _mm_sub_epi16(T10A, T10B); // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0] T21 = _mm_sub_epi16(T11A, T11B); T22 = _mm_sub_epi16(T12A, T12B); T23 = _mm_sub_epi16(T13A, T13B); T24 = _mm_sub_epi16(T14A, T14B); T25 = _mm_sub_epi16(T15A, T15B); T26 = _mm_sub_epi16(T16A, T16B); T27 = _mm_sub_epi16(T17A, T17B); MAKE_ODD(4, 2); MAKE_ODD(5, 6); MAKE_ODD(6, 10); MAKE_ODD(7, 14); MAKE_ODD(8, 18); MAKE_ODD(9, 22); MAKE_ODD(10, 26); MAKE_ODD(11, 30); #undef MAKE_ODD // _mm_load_si128((__m128i*)tab_dct_32_1[(tab) ]) -> *((__m128i*)tab_dct_32_1[(tab) ]) #define MAKE_ODD(tab, dstPos) \ T20 = _mm_madd_epi16(T00A, *((__m128i*)tab_dct_32_1[(tab)])); \ T21 = _mm_madd_epi16(T00B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T22 = _mm_madd_epi16(T01A, *((__m128i*)tab_dct_32_1[(tab)])); \ T23 = _mm_madd_epi16(T01B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T24 = _mm_madd_epi16(T02A, *((__m128i*)tab_dct_32_1[(tab)])); \ T25 = _mm_madd_epi16(T02B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T26 = _mm_madd_epi16(T03A, *((__m128i*)tab_dct_32_1[(tab)])); \ T27 = _mm_madd_epi16(T03B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T30 = _mm_madd_epi16(T04A, *((__m128i*)tab_dct_32_1[(tab)])); \ T31 = _mm_madd_epi16(T04B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T32 = _mm_madd_epi16(T05A, *((__m128i*)tab_dct_32_1[(tab)])); \ T33 = _mm_madd_epi16(T05B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T34 = _mm_madd_epi16(T06A, *((__m128i*)tab_dct_32_1[(tab)])); \ T35 = _mm_madd_epi16(T06B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ T36 = _mm_madd_epi16(T07A, *((__m128i*)tab_dct_32_1[(tab)])); \ T37 = _mm_madd_epi16(T07B, *((__m128i*)tab_dct_32_1[(tab)+1])); \ \ T40 = _mm_hadd_epi32(T20, T21); \ T41 = _mm_hadd_epi32(T22, T23); \ T42 = _mm_hadd_epi32(T24, T25); \ T43 = _mm_hadd_epi32(T26, T27); \ T44 = _mm_hadd_epi32(T30, T31); \ T45 = _mm_hadd_epi32(T32, T33); \ T46 = _mm_hadd_epi32(T34, T35); \ T47 = _mm_hadd_epi32(T36, T37); \ \ T50 = _mm_hadd_epi32(T40, T41); \ T51 = _mm_hadd_epi32(T42, T43); \ T52 = _mm_hadd_epi32(T44, T45); \ T53 = _mm_hadd_epi32(T46, T47); \ \ T50 = 
_mm_hadd_epi32(T50, T51); \ T51 = _mm_hadd_epi32(T52, T53); \ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_add1), shift1); \ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_add1), shift1); \ T60 = _mm_packs_epi32(T50, T51); \ im[(dstPos)][i] = T60; MAKE_ODD(12, 1); MAKE_ODD(14, 3); MAKE_ODD(16, 5); MAKE_ODD(18, 7); MAKE_ODD(20, 9); MAKE_ODD(22, 11); MAKE_ODD(24, 13); MAKE_ODD(26, 15); MAKE_ODD(28, 17); MAKE_ODD(30, 19); MAKE_ODD(32, 21); MAKE_ODD(34, 23); MAKE_ODD(36, 25); MAKE_ODD(38, 27); MAKE_ODD(40, 29); MAKE_ODD(42, 31); #undef MAKE_ODD } // DCT2 __m128i R0C0, R0C1, R0C2, R0C3, R0C4, R0C5, R0C6; __m128i R1C0, R1C1, R1C2, R1C3, R1C4, R1C5, R1C6; __m128i R2C0, R2C1, R2C2, R2C3, R2C4, R2C5, R2C6; __m128i R3C0, R3C1, R3C2, R3C3, R3C4, R3C5, R3C6; __m128i R4C0, R4C1, R4C2, R4C3, R4C4, R4C5, R4C6; __m128i R5C0, R5C1, R5C2, R5C3, R5C4, R5C5, R5C6; __m128i R6C0, R6C1, R6C2, R6C3, R6C4, R6C5, R6C6; __m128i R7C0, R7C1, R7C2, R7C3, R7C4, R7C5, R7C6; __m128i R0C0_origin, R0C1_origin, R0C2_origin, R0C3_origin, R0C4_origin, R0C5_origin, R0C6_origin, R0C7_origin; __m128i R1C0_origin, R1C1_origin, R1C2_origin, R1C3_origin, R1C4_origin, R1C5_origin, R1C6_origin, R1C7_origin; __m128i R2C0_origin, R2C1_origin, R2C2_origin, R2C3_origin, R2C4_origin, R2C5_origin, R2C6_origin, R2C7_origin; __m128i R3C0_origin, R3C1_origin, R3C2_origin, R3C3_origin, R3C4_origin, R3C5_origin, R3C6_origin, R3C7_origin; __m128i R4C0_origin, R4C1_origin, R4C2_origin, R4C3_origin, R4C4_origin, R4C5_origin, R4C6_origin, R4C7_origin; __m128i R5C0_origin, R5C1_origin, R5C2_origin, R5C3_origin, R5C4_origin, R5C5_origin, R5C6_origin, R5C7_origin; __m128i R6C0_origin, R6C1_origin, R6C2_origin, R6C3_origin, R6C4_origin, R6C5_origin, R6C6_origin, R6C7_origin; __m128i R7C0_origin, R7C1_origin, R7C2_origin, R7C3_origin, R7C4_origin, R7C5_origin, R7C6_origin, R7C7_origin; __m128i COE0, COE1, COE2, COE3, COE4, COE5, COE6, COE7; __m128i COE_re_0, COE_re_1; __m128i COE_result; for (i = 0; i < 32 / 8; i++) { T00A = im[i * 8 + 0][0]; // [07 06 05 04 03 02 01 00] T00B = im[i * 8 + 0][1]; // [15 14 13 12 11 10 09 08] T00C = im[i * 8 + 0][2]; // [23 22 21 20 19 18 17 16] T00D = im[i * 8 + 0][3]; // [31 30 29 28 27 26 25 24] T01A = im[i * 8 + 1][0]; T01B = im[i * 8 + 1][1]; T01C = im[i * 8 + 1][2]; T01D = im[i * 8 + 1][3]; T02A = im[i * 8 + 2][0]; T02B = im[i * 8 + 2][1]; T02C = im[i * 8 + 2][2]; T02D = im[i * 8 + 2][3]; T03A = im[i * 8 + 3][0]; T03B = im[i * 8 + 3][1]; T03C = im[i * 8 + 3][2]; T03D = im[i * 8 + 3][3]; T04A = im[i * 8 + 4][0]; T04B = im[i * 8 + 4][1]; T04C = im[i * 8 + 4][2]; T04D = im[i * 8 + 4][3]; T05A = im[i * 8 + 5][0]; T05B = im[i * 8 + 5][1]; T05C = im[i * 8 + 5][2]; T05D = im[i * 8 + 5][3]; T06A = im[i * 8 + 6][0]; T06B = im[i * 8 + 6][1]; T06C = im[i * 8 + 6][2]; T06D = im[i * 8 + 6][3]; T07A = im[i * 8 + 7][0]; T07B = im[i * 8 + 7][1]; T07C = im[i * 8 + 7][2]; T07D = im[i * 8 + 7][3]; T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00] T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15] T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [21 18 22 17 20 19 23 16] T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [26 29 25 30 27 28 24 31] T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T01D = 
_mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1])); T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // (i*8)+0 R0C0_origin = _mm_cvtepi16_epi32(T00A);// 04 03 07 00 T00A = _mm_srli_si128(T00A, 8); R0C1_origin = _mm_cvtepi16_epi32(T00A);// 05 02 06 01 R0C2_origin = _mm_cvtepi16_epi32(T00B);// 11 12 08 15 T00B = _mm_srli_si128(T00B, 8); R0C3_origin = _mm_cvtepi16_epi32(T00B);// 10 13 09 14 R0C4_origin = _mm_cvtepi16_epi32(T00C);// 20 19 23 16 T00C = _mm_srli_si128(T00C, 8); R0C5_origin = _mm_cvtepi16_epi32(T00C);// 21 18 22 17 R0C6_origin = _mm_cvtepi16_epi32(T00D);// 27 28 24 31 T00D = _mm_srli_si128(T00D, 8); R0C7_origin = _mm_cvtepi16_epi32(T00D);// 26 29 25 30 //add 32bit R0C0 = _mm_add_epi32(R0C0_origin, R0C6_origin);// [04+27] [03+28] [07+24] [00+31] R0C1 = _mm_add_epi32(R0C1_origin, R0C7_origin);// [05+26] [02+29] [06+25] [01+30] R0C2 = _mm_add_epi32(R0C2_origin, R0C4_origin);// [11+20] [12+19] [08+23] [15+16] R0C3 = _mm_add_epi32(R0C3_origin, R0C5_origin);// [10+21] [13+18] [09+22] [14+17] R0C4 = _mm_add_epi32(R0C0, R0C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R0C5 = _mm_add_epi32(R0C1, R0C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R0C6 = _mm_hadd_epi32(R0C4, R0C5);// [] [] [] [] 2 1 3 0 // (i*8)+1 R1C0_origin = _mm_cvtepi16_epi32(T01A);// 04 03 07 00 T01A = _mm_srli_si128(T01A, 8); R1C1_origin = _mm_cvtepi16_epi32(T01A);// 05 02 06 01 R1C2_origin = _mm_cvtepi16_epi32(T01B);// 11 12 08 15 T01B = _mm_srli_si128(T01B, 8); R1C3_origin = _mm_cvtepi16_epi32(T01B);// 10 13 09 14 R1C4_origin = _mm_cvtepi16_epi32(T01C);// 20 19 23 16 T01C = _mm_srli_si128(T01C, 8); R1C5_origin = _mm_cvtepi16_epi32(T01C);// 21 18 22 17 R1C6_origin = _mm_cvtepi16_epi32(T01D);// 27 28 24 31 T01D = _mm_srli_si128(T01D, 8); 
R1C7_origin = _mm_cvtepi16_epi32(T01D);// 26 29 25 30 R1C0 = _mm_add_epi32(R1C0_origin, R1C6_origin); R1C1 = _mm_add_epi32(R1C1_origin, R1C7_origin); R1C2 = _mm_add_epi32(R1C2_origin, R1C4_origin); R1C3 = _mm_add_epi32(R1C3_origin, R1C5_origin); R1C4 = _mm_add_epi32(R1C0, R1C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R1C5 = _mm_add_epi32(R1C1, R1C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R1C6 = _mm_hadd_epi32(R1C4, R1C5);// [] [] [] [] 2 1 3 0 // (i*8)+2 R2C0_origin = _mm_cvtepi16_epi32(T02A); // 04 03 07 00 T02A = _mm_srli_si128(T02A, 8); R2C1_origin = _mm_cvtepi16_epi32(T02A);// 05 02 06 01 R2C2_origin = _mm_cvtepi16_epi32(T02B);// 11 12 08 15 T02B = _mm_srli_si128(T02B, 8); R2C3_origin = _mm_cvtepi16_epi32(T02B);// 10 13 09 14 R2C4_origin = _mm_cvtepi16_epi32(T02C);// 20 19 23 16 T02C = _mm_srli_si128(T02C, 8); R2C5_origin = _mm_cvtepi16_epi32(T02C);// 21 18 22 17 R2C6_origin = _mm_cvtepi16_epi32(T02D);// 27 28 24 31 T02D = _mm_srli_si128(T02D, 8); R2C7_origin = _mm_cvtepi16_epi32(T02D);// 26 29 25 30 R2C0 = _mm_add_epi32(R2C0_origin, R2C6_origin);// [04+27] [03+28] [07+24] [00+31] R2C1 = _mm_add_epi32(R2C1_origin, R2C7_origin);// [05+26] [02+29] [06+25] [01+30] R2C2 = _mm_add_epi32(R2C2_origin, R2C4_origin);// [11+20] [12+19] [08+23] [15+16] R2C3 = _mm_add_epi32(R2C3_origin, R2C5_origin);// [10+21] [13+18] [09+22] [14+17] R2C4 = _mm_add_epi32(R2C0, R2C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R2C5 = _mm_add_epi32(R2C1, R2C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R2C6 = _mm_hadd_epi32(R2C4, R2C5);// [] [] [] [] 2 1 3 0 // (i*8)+3 R3C0_origin = _mm_cvtepi16_epi32(T03A); // 04 03 07 00 T03A = _mm_srli_si128(T03A, 8); R3C1_origin = _mm_cvtepi16_epi32(T03A);// 05 02 06 01 R3C2_origin = _mm_cvtepi16_epi32(T03B);// 11 12 08 15 T03B = _mm_srli_si128(T03B, 8); R3C3_origin = _mm_cvtepi16_epi32(T03B);// 10 13 09 14 R3C4_origin = _mm_cvtepi16_epi32(T03C);// 20 19 23 16 T03C = _mm_srli_si128(T03C, 8); R3C5_origin = _mm_cvtepi16_epi32(T03C);// 21 18 22 17 R3C6_origin = _mm_cvtepi16_epi32(T03D);// 27 28 24 31 T03D = _mm_srli_si128(T03D, 8); R3C7_origin = _mm_cvtepi16_epi32(T03D);// 26 29 25 30 R3C0 = _mm_add_epi32(R3C0_origin, R3C6_origin); R3C1 = _mm_add_epi32(R3C1_origin, R3C7_origin); R3C2 = _mm_add_epi32(R3C2_origin, R3C4_origin); R3C3 = _mm_add_epi32(R3C3_origin, R3C5_origin); R3C4 = _mm_add_epi32(R3C0, R3C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R3C5 = _mm_add_epi32(R3C1, R3C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R3C6 = _mm_hadd_epi32(R3C4, R3C5);// [] [] [] [] 2 1 3 0 // (i*8)+4 R4C0_origin = _mm_cvtepi16_epi32(T04A); // 04 03 07 00 T04A = _mm_srli_si128(T04A, 8); R4C1_origin = _mm_cvtepi16_epi32(T04A);// 05 02 06 01 R4C2_origin = _mm_cvtepi16_epi32(T04B);// 11 12 08 15 T04B = _mm_srli_si128(T04B, 8); R4C3_origin = _mm_cvtepi16_epi32(T04B);// 10 13 09 14 R4C4_origin = _mm_cvtepi16_epi32(T04C);// 20 19 23 16 T04C = _mm_srli_si128(T04C, 8); R4C5_origin = _mm_cvtepi16_epi32(T04C);// 21 18 22 17 R4C6_origin = _mm_cvtepi16_epi32(T04D);// 27 28 24 31 T04D = _mm_srli_si128(T04D, 8); R4C7_origin = _mm_cvtepi16_epi32(T04D);// 26 29 25 30 R4C0 = _mm_add_epi32(R4C0_origin, R4C6_origin);// [04+27] [03+28] [07+24] [00+31] R4C1 = _mm_add_epi32(R4C1_origin, R4C7_origin);// [05+26] [02+29] [06+25] [01+30] R4C2 = _mm_add_epi32(R4C2_origin, R4C4_origin);// [11+20] [12+19] 
[08+23] [15+16] R4C3 = _mm_add_epi32(R4C3_origin, R4C5_origin);// [10+21] [13+18] [09+22] [14+17] R4C4 = _mm_add_epi32(R4C0, R4C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R4C5 = _mm_add_epi32(R4C1, R4C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R4C6 = _mm_hadd_epi32(R4C4, R4C5);// [] [] [] [] 2 1 3 0 // (i*8)+5 R5C0_origin = _mm_cvtepi16_epi32(T05A); // 04 03 07 00 T05A = _mm_srli_si128(T05A, 8); R5C1_origin = _mm_cvtepi16_epi32(T05A);// 05 02 06 01 R5C2_origin = _mm_cvtepi16_epi32(T05B);// 11 12 08 15 T05B = _mm_srli_si128(T05B, 8); R5C3_origin = _mm_cvtepi16_epi32(T05B);// 10 13 09 14 R5C4_origin = _mm_cvtepi16_epi32(T05C);// 20 19 23 16 T05C = _mm_srli_si128(T05C, 8); R5C5_origin = _mm_cvtepi16_epi32(T05C);// 21 18 22 17 R5C6_origin = _mm_cvtepi16_epi32(T05D);// 27 28 24 31 T05D = _mm_srli_si128(T05D, 8); R5C7_origin = _mm_cvtepi16_epi32(T05D);// 26 29 25 30 R5C0 = _mm_add_epi32(R5C0_origin, R5C6_origin); R5C1 = _mm_add_epi32(R5C1_origin, R5C7_origin); R5C2 = _mm_add_epi32(R5C2_origin, R5C4_origin); R5C3 = _mm_add_epi32(R5C3_origin, R5C5_origin); R5C4 = _mm_add_epi32(R5C0, R5C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R5C5 = _mm_add_epi32(R5C1, R5C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R5C6 = _mm_hadd_epi32(R5C4, R5C5);// [] [] [] [] 2 1 3 0 // (i*8)+6 R6C0_origin = _mm_cvtepi16_epi32(T06A); // 04 03 07 00 T06A = _mm_srli_si128(T06A, 8); R6C1_origin = _mm_cvtepi16_epi32(T06A);// 05 02 06 01 R6C2_origin = _mm_cvtepi16_epi32(T06B);// 11 12 08 15 T06B = _mm_srli_si128(T06B, 8); R6C3_origin = _mm_cvtepi16_epi32(T06B);// 10 13 09 14 R6C4_origin = _mm_cvtepi16_epi32(T06C);// 20 19 23 16 T06C = _mm_srli_si128(T06C, 8); R6C5_origin = _mm_cvtepi16_epi32(T06C);// 21 18 22 17 R6C6_origin = _mm_cvtepi16_epi32(T06D);// 27 28 24 31 T06D = _mm_srli_si128(T06D, 8); R6C7_origin = _mm_cvtepi16_epi32(T06D);// 26 29 25 30 R6C0 = _mm_add_epi32(R6C0_origin, R6C6_origin);// [04+27] [03+28] [07+24] [00+31] R6C1 = _mm_add_epi32(R6C1_origin, R6C7_origin);// [05+26] [02+29] [06+25] [01+30] R6C2 = _mm_add_epi32(R6C2_origin, R6C4_origin);// [11+20] [12+19] [08+23] [15+16] R6C3 = _mm_add_epi32(R6C3_origin, R6C5_origin);// [10+21] [13+18] [09+22] [14+17] R6C4 = _mm_add_epi32(R6C0, R6C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R6C5 = _mm_add_epi32(R6C1, R6C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 09+22] [01+30 + 14+17]----5 2 6 1 R6C6 = _mm_hadd_epi32(R6C4, R6C5);// [] [] [] [] 2 1 3 0 // (i*8)+7 R7C0_origin = _mm_cvtepi16_epi32(T07A);// 04 03 07 00 T07A = _mm_srli_si128(T07A, 8); R7C1_origin = _mm_cvtepi16_epi32(T07A);// 05 02 06 01 R7C2_origin = _mm_cvtepi16_epi32(T07B);// 11 12 08 15 T07B = _mm_srli_si128(T07B, 8); R7C3_origin = _mm_cvtepi16_epi32(T07B);// 10 13 09 14 R7C4_origin = _mm_cvtepi16_epi32(T07C);// 20 19 23 16 T07C = _mm_srli_si128(T07C, 8); R7C5_origin = _mm_cvtepi16_epi32(T07C);// 21 18 22 17 R7C6_origin = _mm_cvtepi16_epi32(T07D);// 27 28 24 31 T07D = _mm_srli_si128(T07D, 8); R7C7_origin = _mm_cvtepi16_epi32(T07D);// 26 29 25 30 R7C0 = _mm_add_epi32(R7C0_origin, R7C6_origin); R7C1 = _mm_add_epi32(R7C1_origin, R7C7_origin); R7C2 = _mm_add_epi32(R7C2_origin, R7C4_origin); R7C3 = _mm_add_epi32(R7C3_origin, R7C5_origin); R7C4 = _mm_add_epi32(R7C0, R7C2);// [04+27 + 11+20] [03+28 + 12+29] [07+24 + 08+23] [00+31 + 15+16]----4 3 7 0 R7C5 = _mm_add_epi32(R7C1, R7C3);// [05+26 + 10+21] [02+29 + 13+18] [06+25 + 
09+22] [01+30 + 14+17]----5 2 6 1 R7C6 = _mm_hadd_epi32(R7C4, R7C5);// [] [] [] [] 2 1 3 0 //coefficient //compute coefficient COE0 = _mm_mullo_epi32(R0C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE1 = _mm_mullo_epi32(R1C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE2 = _mm_mullo_epi32(R2C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE3 = _mm_mullo_epi32(R3C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE4 = _mm_mullo_epi32(R4C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE5 = _mm_mullo_epi32(R5C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE6 = _mm_mullo_epi32(R6C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE7 = _mm_mullo_epi32(R7C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[0])); COE0 = _mm_hadd_epi32(COE0, COE1);//low 64 bit 1th row ,high 64 bit 2th coefficient COE1 = _mm_hadd_epi32(COE2, COE3); COE2 = _mm_hadd_epi32(COE4, COE5); COE3 = _mm_hadd_epi32(COE6, COE7); COE_re_0 = _mm_hadd_epi32(COE0, COE1);//[127-96] 3th row [95-64] 2th row [63-32] 1th row [31-0] 0 row COE_re_1 = _mm_hadd_epi32(COE2, COE3); COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); _mm_store_si128((__m128i*)(dst + (i * 8) + 0), COE_result); COE_re_0 = _mm_hsub_epi32(COE0, COE1); COE_re_1 = _mm_hsub_epi32(COE2, COE3); COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); _mm_store_si128((__m128i*)(dst + (16 * i_src) + (i * 8) + 0), COE_result); COE0 = _mm_mullo_epi32(R0C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE1 = _mm_mullo_epi32(R1C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE2 = _mm_mullo_epi32(R2C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE3 = _mm_mullo_epi32(R3C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE4 = _mm_mullo_epi32(R4C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE5 = _mm_mullo_epi32(R5C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE6 = _mm_mullo_epi32(R6C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE7 = _mm_mullo_epi32(R7C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1])); COE0 = _mm_hadd_epi32(COE0, COE1);//low 64 bit 1th row ,high 64 bit 2th coefficient COE1 = _mm_hadd_epi32(COE2, COE3); COE2 = _mm_hadd_epi32(COE4, COE5); COE3 = _mm_hadd_epi32(COE6, COE7); COE_re_0 = _mm_hadd_epi32(COE0, COE1);//[127-96] 3th row [95-64] 2th row [63-32] 1th row [31-0] 0 row COE_re_1 = _mm_hadd_epi32(COE2, COE3); COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); _mm_store_si128((__m128i*)(dst + (8 * i_src) + (i * 8) + 0), COE_result); COE0 = _mm_mullo_epi32(R0C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE1 = _mm_mullo_epi32(R1C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE2 = _mm_mullo_epi32(R2C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE3 = _mm_mullo_epi32(R3C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE4 = _mm_mullo_epi32(R4C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE5 = _mm_mullo_epi32(R5C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE6 = _mm_mullo_epi32(R6C6, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE7 = _mm_mullo_epi32(R7C6, 
_mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[2])); COE0 = _mm_hadd_epi32(COE0, COE1);//low 64 bit 1th row ,high 64 bit 2th coefficient COE1 = _mm_hadd_epi32(COE2, COE3); COE2 = _mm_hadd_epi32(COE4, COE5); COE3 = _mm_hadd_epi32(COE6, COE7); COE_re_0 = _mm_hadd_epi32(COE0, COE1);//[127-96] 3th row [95-64] 2th row [63-32] 1th row [31-0] 0 row COE_re_1 = _mm_hadd_epi32(COE2, COE3); COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); _mm_store_si128((__m128i*)(dst + (24 * i_src) + (i * 8) + 0), COE_result); #define MAKE_ODD(tab,dstPos) \ COE0 = _mm_add_epi32(_mm_mullo_epi32(R0C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R0C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE1 = _mm_add_epi32(_mm_mullo_epi32(R1C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R1C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE2 = _mm_add_epi32(_mm_mullo_epi32(R2C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R2C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE3 = _mm_add_epi32(_mm_mullo_epi32(R3C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R3C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE4 = _mm_add_epi32(_mm_mullo_epi32(R4C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R4C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE5 = _mm_add_epi32(_mm_mullo_epi32(R5C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R5C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE6 = _mm_add_epi32(_mm_mullo_epi32(R6C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R6C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE7 = _mm_add_epi32(_mm_mullo_epi32(R7C4, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(R7C5, _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ \ COE0 = _mm_hadd_epi32(COE0, COE1); \ COE1 = _mm_hadd_epi32(COE2, COE3); \ COE2 = _mm_hadd_epi32(COE4, COE5); \ COE3 = _mm_hadd_epi32(COE6, COE7); \ \ COE_re_0 = _mm_hadd_epi32(COE0, COE1); \ COE_re_1 = _mm_hadd_epi32(COE2, COE3); \ COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); \ COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); \ COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); \ _mm_store_si128((__m128i*)(dst + (dstPos * i_src) + (i * 8) + 0), COE_result); MAKE_ODD(3, 4); MAKE_ODD(5, 12); MAKE_ODD(7, 20); MAKE_ODD(9, 28); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ COE0 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R0C0, R0C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R0C1, R0C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE1 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R1C0, R1C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R1C1, R1C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE2 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R2C0, R2C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R2C1, R2C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE3 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R3C0, R3C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), 
_mm_mullo_epi32(_mm_sub_epi32(R3C1, R3C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE4 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R4C0, R4C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R4C1, R4C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE5 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R5C0, R5C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R5C1, R5C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE6 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R6C0, R6C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R6C1, R6C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ COE7 = _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R7C0, R7C2), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R7C1, R7C3), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))); \ \ COE0 = _mm_hadd_epi32(COE0, COE1); \ COE1 = _mm_hadd_epi32(COE2, COE3); \ COE2 = _mm_hadd_epi32(COE4, COE5); \ COE3 = _mm_hadd_epi32(COE6, COE7); \ \ COE_re_0 = _mm_hadd_epi32(COE0, COE1); \ COE_re_1 = _mm_hadd_epi32(COE2, COE3); \ COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); \ COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); \ COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); \ _mm_store_si128((__m128i*)(dst + (dstPos * i_src) + (i * 8) + 0), COE_result); MAKE_ODD(11, 2); MAKE_ODD(13, 6); MAKE_ODD(15, 10); MAKE_ODD(17, 14); MAKE_ODD(19, 18); MAKE_ODD(21, 22); MAKE_ODD(23, 26); MAKE_ODD(25, 30); #undef MAKE_ODD /*COE0 = _mm_add_epi32( _mm_add_epi32( _mm_mullo_epi32(_mm_sub_epi32(R0C0_origin, R0C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1 ]) ) , _mm_mullo_epi32(_mm_sub_epi32(R0C1_origin, R0C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1] ) ) ) , _mm_add_epi32( _mm_mullo_epi32(_mm_sub_epi32(R0C2_origin, R0C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1] ) ) , _mm_mullo_epi32(_mm_sub_epi32(R0C3_origin, R0C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[1] ) ) ) ); */ //compute 1 3 5 7 9 11 ....29 31 //dct coefficient matrix is symmetry .So according to this property . we can compute [0-31] [7-24].... 
//then add corresponding bit .we can get the result.-------zhangjiaqi 2016-12-10 #define MAKE_ODD(tab,dstPos) \ COE0 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R0C0_origin, R0C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R0C1_origin, R0C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R0C2_origin, R0C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R0C3_origin, R0C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE1 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R1C0_origin, R1C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R1C1_origin, R1C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R1C2_origin, R1C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R1C3_origin, R1C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE2 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R2C0_origin, R2C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R2C1_origin, R2C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R2C2_origin, R2C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R2C3_origin, R2C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE3 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R3C0_origin, R3C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R3C1_origin, R3C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R3C2_origin, R3C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R3C3_origin, R3C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE4 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R4C0_origin, R4C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R4C1_origin, R4C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R4C2_origin, R4C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R4C3_origin, R4C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE5 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R5C0_origin, R5C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R5C1_origin, R5C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R5C2_origin, R5C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R5C3_origin, R5C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE6 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R6C0_origin, R6C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R6C1_origin, R6C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R6C2_origin, R6C4_origin), 
_mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R6C3_origin, R6C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ COE7 = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R7C0_origin, R7C6_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab])), _mm_mullo_epi32(_mm_sub_epi32(R7C1_origin, R7C7_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 1]))), _mm_add_epi32(_mm_mullo_epi32(_mm_sub_epi32(R7C2_origin, R7C4_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 2])), _mm_mullo_epi32(_mm_sub_epi32(R7C3_origin, R7C5_origin), _mm_load_si128((__m128i*)tab_dct_32_zhangjiaqi[tab + 3])))); \ \ COE0 = _mm_hadd_epi32(COE0, COE1); \ COE1 = _mm_hadd_epi32(COE2, COE3); \ COE2 = _mm_hadd_epi32(COE4, COE5); \ COE3 = _mm_hadd_epi32(COE6, COE7); \ \ COE_re_0 = _mm_hadd_epi32(COE0, COE1); \ COE_re_1 = _mm_hadd_epi32(COE2, COE3); \ COE_re_0 = _mm_srai_epi32(_mm_add_epi32(COE_re_0, c_add2), SHIFT2); \ COE_re_1 = _mm_srai_epi32(_mm_add_epi32(COE_re_1, c_add2), SHIFT2); \ COE_result = _mm_packs_epi32(COE_re_0, COE_re_1); \ \ _mm_store_si128((__m128i*)(dst + (dstPos * i_src) + (i * 8) + 0), COE_result); MAKE_ODD(27, 1); MAKE_ODD(31, 3); MAKE_ODD(35, 5); MAKE_ODD(39, 7); MAKE_ODD(43, 9); MAKE_ODD(47, 11); MAKE_ODD(51, 13); MAKE_ODD(55, 15); MAKE_ODD(59, 17); MAKE_ODD(63, 19); MAKE_ODD(67, 21); MAKE_ODD(71, 23); MAKE_ODD(75, 25); MAKE_ODD(79, 27); MAKE_ODD(83, 29); MAKE_ODD(87, 31); #undef MAKE_ODD } #undef SHIFT1 #undef ADD1 #undef SHIFT2 #undef ADD2 } /* --------------------------------------------------------------------------- */ static void trans_2nd_hor_sse128(coeff_t *coeff, int i_coeff) { #define SHIFT 7 #define ADD 64 // const __m128i c_add = _mm_set1_epi32(ADD); // load 4x4 coeffs __m128i T10 = _mm_loadl_epi64((__m128i*)(coeff + 0 * i_coeff)); // [0 0 0 0 a3 a2 a1 a0] __m128i T11 = _mm_loadl_epi64((__m128i*)(coeff + 1 * i_coeff)); // [0 0 0 0 b3 b2 b1 b0] __m128i T12 = _mm_loadl_epi64((__m128i*)(coeff + 2 * i_coeff)); // [0 0 0 0 c3 c2 c1 c0] __m128i T13 = _mm_loadl_epi64((__m128i*)(coeff + 3 * i_coeff)); // [0 0 0 0 d3 d2 d1 d0] __m128i T20 = _mm_shuffle_epi32(T10, 0x00); // [a1 a0 a1 a0 a1 a0 a1 a0] __m128i T21 = _mm_shuffle_epi32(T10, 0x55); // [a3 a2 a3 a2 a3 a2 a3 a2] __m128i T22 = _mm_shuffle_epi32(T11, 0x00); __m128i T23 = _mm_shuffle_epi32(T11, 0x55); __m128i T24 = _mm_shuffle_epi32(T12, 0x00); __m128i T25 = _mm_shuffle_epi32(T12, 0x55); __m128i T26 = _mm_shuffle_epi32(T13, 0x00); __m128i T27 = _mm_shuffle_epi32(T13, 0x55); // load g_2T_H transform matrix __m128i C10 = _mm_load_si128((__m128i*)(g_2T_H + 0 * 2 * SEC_TR_SIZE)); // [h1 h0 g1 g0 f1 f0 e1 e0] __m128i C11 = _mm_load_si128((__m128i*)(g_2T_H + 1 * 2 * SEC_TR_SIZE)); // [h3 h2 g3 g2 f3 f2 e3 e2] // transform __m128i T30 = _mm_madd_epi16(T20, C10); // [a0*h0+a1*h1, a0*g0+a1*g1, a0*f0+a1*f1, a0*e0+a1*e1] __m128i T31 = _mm_madd_epi16(T21, C11); // [a2*h2+a3*h3, a2*g2+a3*g3, a2*f2+a3*f3, a2*e2+a3*e3] __m128i T32 = _mm_madd_epi16(T22, C10); __m128i T33 = _mm_madd_epi16(T23, C11); __m128i T34 = _mm_madd_epi16(T24, C10); __m128i T35 = _mm_madd_epi16(T25, C11); __m128i T36 = _mm_madd_epi16(T26, C10); __m128i T37 = _mm_madd_epi16(T27, C11); __m128i T40 = _mm_add_epi32(T30, T31); __m128i T41 = _mm_add_epi32(T32, T33); __m128i T42 = _mm_add_epi32(T34, T35); __m128i T43 = _mm_add_epi32(T36, T37); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add), SHIFT); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add), SHIFT); T42 = _mm_srai_epi32(_mm_add_epi32(T42, 
c_add), SHIFT); T43 = _mm_srai_epi32(_mm_add_epi32(T43, c_add), SHIFT); // store T40 = _mm_packs_epi32(T40, T40); T41 = _mm_packs_epi32(T41, T41); T42 = _mm_packs_epi32(T42, T42); T43 = _mm_packs_epi32(T43, T43); _mm_storel_epi64((__m128i*)(coeff + 0 * i_coeff), T40); _mm_storel_epi64((__m128i*)(coeff + 1 * i_coeff), T41); _mm_storel_epi64((__m128i*)(coeff + 2 * i_coeff), T42); _mm_storel_epi64((__m128i*)(coeff + 3 * i_coeff), T43); #undef SHIFT #undef ADD } /* --------------------------------------------------------------------------- */ static void trans_2nd_ver_sse128(coeff_t *coeff, int i_coeff) { #define SHIFT 7 #define ADD 64 // const __m128i c_add = _mm_set1_epi32(ADD); // load 4x4 coeffs __m128i T10 = _mm_loadl_epi64((__m128i*)(coeff + 0 * i_coeff)); // [0 0 0 0 a3 a2 a1 a0] __m128i T11 = _mm_loadl_epi64((__m128i*)(coeff + 1 * i_coeff)); // [0 0 0 0 b3 b2 b1 b0] __m128i T12 = _mm_loadl_epi64((__m128i*)(coeff + 2 * i_coeff)); // [0 0 0 0 c3 c2 c1 c0] __m128i T13 = _mm_loadl_epi64((__m128i*)(coeff + 3 * i_coeff)); // [0 0 0 0 d3 d2 d1 d0] __m128i T20 = _mm_unpacklo_epi16(T10, T11); // [b3 a3 b2 a2 b1 a1 b0 a0] __m128i T21 = _mm_unpacklo_epi16(T12, T13); // [d3 c3 d2 c2 d1 c1 d0 c0] // load g_2T_V transform matrix __m128i C10 = _mm_load_si128((__m128i*)(g_2T_V + 0 * 2 * SEC_TR_SIZE)); // [e1 e0 e1 e0 e1 e0 e1 e0] __m128i C11 = _mm_load_si128((__m128i*)(g_2T_V + 1 * 2 * SEC_TR_SIZE)); // [e3 e2 e3 e2 e3 e2 e3 e2] __m128i C12 = _mm_load_si128((__m128i*)(g_2T_V + 2 * 2 * SEC_TR_SIZE)); // [f1 f0 f1 f0 f1 f0 f1 f0] __m128i C13 = _mm_load_si128((__m128i*)(g_2T_V + 3 * 2 * SEC_TR_SIZE)); // [f3 f2 f3 f2 f3 f2 f3 f2] __m128i C14 = _mm_load_si128((__m128i*)(g_2T_V + 4 * 2 * SEC_TR_SIZE)); // [g1 g0 g1 g0 g1 g0 g1 g0] __m128i C15 = _mm_load_si128((__m128i*)(g_2T_V + 5 * 2 * SEC_TR_SIZE)); // [g3 g2 g3 g2 g3 g2 g3 g2] __m128i C16 = _mm_load_si128((__m128i*)(g_2T_V + 6 * 2 * SEC_TR_SIZE)); // [h1 h0 h1 h0 h1 h0 h1 h0] __m128i C17 = _mm_load_si128((__m128i*)(g_2T_V + 7 * 2 * SEC_TR_SIZE)); // [h3 h2 h3 h2 h3 h2 h3 h2] // transform __m128i T30 = _mm_madd_epi16(T20, C10); // [a3*e0+b3*e1, a2*e0+b2*e1, a1*e0+b1*e1, a0*e0+b0*e1] __m128i T31 = _mm_madd_epi16(T21, C11); // [c3*e2+d3*e3, c2*e2+d2*e3, c1*e2+d1*e3, c0*e2+d0*e3] __m128i T32 = _mm_madd_epi16(T20, C12); __m128i T33 = _mm_madd_epi16(T21, C13); __m128i T34 = _mm_madd_epi16(T20, C14); __m128i T35 = _mm_madd_epi16(T21, C15); __m128i T36 = _mm_madd_epi16(T20, C16); __m128i T37 = _mm_madd_epi16(T21, C17); __m128i T40 = _mm_add_epi32(T30, T31); __m128i T41 = _mm_add_epi32(T32, T33); __m128i T42 = _mm_add_epi32(T34, T35); __m128i T43 = _mm_add_epi32(T36, T37); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add), SHIFT); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add), SHIFT); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add), SHIFT); T43 = _mm_srai_epi32(_mm_add_epi32(T43, c_add), SHIFT); // store T40 = _mm_packs_epi32(T40, T40); T41 = _mm_packs_epi32(T41, T41); T42 = _mm_packs_epi32(T42, T42); T43 = _mm_packs_epi32(T43, T43); _mm_storel_epi64((__m128i*)(coeff + 0 * i_coeff), T40); _mm_storel_epi64((__m128i*)(coeff + 1 * i_coeff), T41); _mm_storel_epi64((__m128i*)(coeff + 2 * i_coeff), T42); _mm_storel_epi64((__m128i*)(coeff + 3 * i_coeff), T43); #undef SHIFT #undef ADD } /* --------------------------------------------------------------------------- * i_mode - real intra mode (luma) * b_top - block top available? * b_left - block left available? 
*/ void transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (vt && b_top) { trans_2nd_ver_sse128(coeff, i_coeff); } if (ht && b_left) { trans_2nd_hor_sse128(coeff, i_coeff); } } /* --------------------------------------------------------------------------- */ void transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) { const int SHIFT1 = B4X4_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + 1; const int SHIFT2 = B4X4_IN_BIT + FACTO_BIT + 1; const int ADD1 = 1 << (SHIFT1 - 1); const int ADD2 = 1 << (SHIFT2 - 1); __m128i C12, C13, C14, C15, C16, C17; // const __m128i c_add1 = _mm_set1_epi32(ADD1); __m128i c_add2 = _mm_set1_epi32(ADD2); // hor --------------------------------------------------------- // load 4x4 coeffs __m128i T10 = _mm_loadl_epi64((__m128i*)(coeff + 0 * i_coeff)); // [0 0 0 0 a3 a2 a1 a0] __m128i T11 = _mm_loadl_epi64((__m128i*)(coeff + 1 * i_coeff)); // [0 0 0 0 b3 b2 b1 b0] __m128i T12 = _mm_loadl_epi64((__m128i*)(coeff + 2 * i_coeff)); // [0 0 0 0 c3 c2 c1 c0] __m128i T13 = _mm_loadl_epi64((__m128i*)(coeff + 3 * i_coeff)); // [0 0 0 0 d3 d2 d1 d0] __m128i T20 = _mm_shuffle_epi32(T10, 0x00); // [a1 a0 a1 a0 a1 a0 a1 a0] __m128i T21 = _mm_shuffle_epi32(T10, 0x55); // [a3 a2 a3 a2 a3 a2 a3 a2] __m128i T22 = _mm_shuffle_epi32(T11, 0x00); __m128i T23 = _mm_shuffle_epi32(T11, 0x55); __m128i T24 = _mm_shuffle_epi32(T12, 0x00); __m128i T25 = _mm_shuffle_epi32(T12, 0x55); __m128i T26 = _mm_shuffle_epi32(T13, 0x00); __m128i T27 = _mm_shuffle_epi32(T13, 0x55); // load g_2TC_H transform matrix __m128i C10 = _mm_load_si128((__m128i*)(g_2TC_H + 0 * 2 * SEC_TR_SIZE)); // [h1 h0 g1 g0 f1 f0 e1 e0] __m128i C11 = _mm_load_si128((__m128i*)(g_2TC_H + 1 * 2 * SEC_TR_SIZE)); // [h3 h2 g3 g2 f3 f2 e3 e2] // transform __m128i T30 = _mm_madd_epi16(T20, C10); // [a0*h0+a1*h1, a0*g0+a1*g1, a0*f0+a1*f1, a0*e0+a1*e1] __m128i T31 = _mm_madd_epi16(T21, C11); // [a2*h2+a3*h3, a2*g2+a3*g3, a2*f2+a3*f3, a2*e2+a3*e3] __m128i T32 = _mm_madd_epi16(T22, C10); __m128i T33 = _mm_madd_epi16(T23, C11); __m128i T34 = _mm_madd_epi16(T24, C10); __m128i T35 = _mm_madd_epi16(T25, C11); __m128i T36 = _mm_madd_epi16(T26, C10); __m128i T37 = _mm_madd_epi16(T27, C11); __m128i T40 = _mm_add_epi32(T30, T31); __m128i T41 = _mm_add_epi32(T32, T33); __m128i T42 = _mm_add_epi32(T34, T35); __m128i T43 = _mm_add_epi32(T36, T37); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add1), SHIFT1); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add1), SHIFT1); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add1), SHIFT1); T43 = _mm_srai_epi32(_mm_add_epi32(T43, c_add1), SHIFT1); // result of hor transform T40 = _mm_packs_epi32(T40, T40); // [? ? ? ? a3 a2 a1 a0] T41 = _mm_packs_epi32(T41, T41); // [? ? ? ? b3 b2 b1 b0] T42 = _mm_packs_epi32(T42, T42); // [? ? ? ? c3 c2 c1 c0] T43 = _mm_packs_epi32(T43, T43); // [? ? ? ? 
d3 d2 d1 d0] // ver --------------------------------------------------------- T20 = _mm_unpacklo_epi16(T40, T41); // [b3 a3 b2 a2 b1 a1 b0 a0] T21 = _mm_unpacklo_epi16(T42, T43); // [d3 c3 d2 c2 d1 c1 d0 c0] // load g_2TC_V transform matrix C10 = _mm_load_si128((__m128i*)(g_2TC_V + 0 * 2 * SEC_TR_SIZE)); // [e1 e0 e1 e0 e1 e0 e1 e0] C11 = _mm_load_si128((__m128i*)(g_2TC_V + 1 * 2 * SEC_TR_SIZE)); // [e3 e2 e3 e2 e3 e2 e3 e2] C12 = _mm_load_si128((__m128i*)(g_2TC_V + 2 * 2 * SEC_TR_SIZE)); // [f1 f0 f1 f0 f1 f0 f1 f0] C13 = _mm_load_si128((__m128i*)(g_2TC_V + 3 * 2 * SEC_TR_SIZE)); // [f3 f2 f3 f2 f3 f2 f3 f2] C14 = _mm_load_si128((__m128i*)(g_2TC_V + 4 * 2 * SEC_TR_SIZE)); // [g1 g0 g1 g0 g1 g0 g1 g0] C15 = _mm_load_si128((__m128i*)(g_2TC_V + 5 * 2 * SEC_TR_SIZE)); // [g3 g2 g3 g2 g3 g2 g3 g2] C16 = _mm_load_si128((__m128i*)(g_2TC_V + 6 * 2 * SEC_TR_SIZE)); // [h1 h0 h1 h0 h1 h0 h1 h0] C17 = _mm_load_si128((__m128i*)(g_2TC_V + 7 * 2 * SEC_TR_SIZE)); // [h3 h2 h3 h2 h3 h2 h3 h2] // transform T30 = _mm_madd_epi16(T20, C10); // [a3*e0+b3*e1, a2*e0+b2*e1, a1*e0+b1*e1, a0*e0+b0*e1] T31 = _mm_madd_epi16(T21, C11); // [c3*e2+d3*e3, c2*e2+d2*e3, c1*e2+d1*e3, c0*e2+d0*e3] T32 = _mm_madd_epi16(T20, C12); T33 = _mm_madd_epi16(T21, C13); T34 = _mm_madd_epi16(T20, C14); T35 = _mm_madd_epi16(T21, C15); T36 = _mm_madd_epi16(T20, C16); T37 = _mm_madd_epi16(T21, C17); T40 = _mm_add_epi32(T30, T31); T41 = _mm_add_epi32(T32, T33); T42 = _mm_add_epi32(T34, T35); T43 = _mm_add_epi32(T36, T37); T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_add2), SHIFT2); T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_add2), SHIFT2); T42 = _mm_srai_epi32(_mm_add_epi32(T42, c_add2), SHIFT2); T43 = _mm_srai_epi32(_mm_add_epi32(T43, c_add2), SHIFT2); // store T40 = _mm_packs_epi32(T40, T40); T41 = _mm_packs_epi32(T41, T41); T42 = _mm_packs_epi32(T42, T42); T43 = _mm_packs_epi32(T43, T43); _mm_storel_epi64((__m128i*)(coeff + 0 * i_coeff), T40); _mm_storel_epi64((__m128i*)(coeff + 1 * i_coeff), T41); _mm_storel_epi64((__m128i*)(coeff + 2 * i_coeff), T42); _mm_storel_epi64((__m128i*)(coeff + 3 * i_coeff), T43); } // transpose 8x8 & transpose 16x16(ת) #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, 
B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \
    TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \
    TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \
    TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \
    TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \

void wavelet_16x64_sse128(coeff_t *coeff)
{
    // source block: 64 rows x 16 coeffs (two __m128i per row)
    __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2],
            V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2],
            V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2],
            V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2],
            V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2],
            V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2],
            V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2],
            V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2];
    // transposed block: 16 rows x 64 coeffs (eight __m128i per row)
    __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8],
            T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8];
    // temporaries for the vertical pass: 64 rows x 8 coeffs
    __m128i B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15,
            B16, B17, B18, B19, B20, B21, B22, B23, B24, B25, B26, B27, B28, B29, B30, B31;
    __m128i B32, B33, B34, B35, B36, B37, B38, B39, B40, B41, B42, B43, B44, B45, B46, B47,
            B48, B49, B50, B51, B52, B53, B54, B55, B56, B57, B58, B59, B60, B61, B62, B63;
    // transpose scratch registers
    __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
    __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
    int i;
    __m128i mAddOffset1 = _mm_set1_epi16(1);
    __m128i mAddOffset2 = _mm_set1_epi16(2);

    // load the 16x64 coefficient block, 64 rows of 16 coeffs each
    for (i = 0; i < 2; i++) {
        V00[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 0]);
        V01[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 1]);
        V02[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 2]);
        V03[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 3]);
        V04[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 4]);
        V05[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 5]);
        V06[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 6]);
        V07[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 7]);
        V08[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 8]);
        V09[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 9]);
        V10[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 10]);
        V11[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 11]);
        V12[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 12]);
        V13[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 13]);
        V14[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 14]);
        V15[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 15]);
        V16[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 16]);
        V17[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 17]);
        V18[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 18]);
        V19[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 19]);
        V20[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 20]);
        V21[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 21]);
        V22[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 22]);
        V23[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 23]);
        V24[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 24]);
        V25[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 25]);
        V26[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 26]);
        V27[i] =
_mm_load_si128((__m128i*)&coeff[8 * i + 16 * 27]); V28[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 28]); V29[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 29]); V30[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 30]); V31[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 31]); V32[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 32]); V33[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 33]); V34[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 34]); V35[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 35]); V36[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 36]); V37[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 37]); V38[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 38]); V39[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 39]); V40[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 40]); V41[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 41]); V42[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 42]); V43[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 43]); V44[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 44]); V45[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 45]); V46[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 46]); V47[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 47]); V48[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 48]); V49[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 49]); V50[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 50]); V51[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 51]); V52[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 52]); V53[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 53]); V54[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 54]); V55[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 55]); V56[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 56]); V57[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 57]); V58[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 58]); V59[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 59]); V60[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 60]); V61[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 61]); V62[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 62]); V63[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 16 * 63]); } TRANSPOSE_8x8_16BIT(V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0]); TRANSPOSE_8x8_16BIT(V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1]); TRANSPOSE_8x8_16BIT(V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2]); TRANSPOSE_8x8_16BIT(V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3]); TRANSPOSE_8x8_16BIT(V32[0], V33[0], V34[0], V35[0], V36[0], V37[0], V38[0], V39[0], T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4]); TRANSPOSE_8x8_16BIT(V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5]); TRANSPOSE_8x8_16BIT(V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0], T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6]); TRANSPOSE_8x8_16BIT(V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7]); TRANSPOSE_8x8_16BIT(V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); 
TRANSPOSE_8x8_16BIT(V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_8x8_16BIT(V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_8x8_16BIT(V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_8x8_16BIT(V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]); TRANSPOSE_8x8_16BIT(V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5]); TRANSPOSE_8x8_16BIT(V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6]); TRANSPOSE_8x8_16BIT(V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7]); /* step 1: horizontal transform */ // pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; for (i = 0; i < 8; i++){ T01[i] = _mm_sub_epi16(T01[i], _mm_srai_epi16(_mm_add_epi16(T00[i], T02[i]), 1)); T03[i] = _mm_sub_epi16(T03[i], _mm_srai_epi16(_mm_add_epi16(T02[i], T04[i]), 1)); T05[i] = _mm_sub_epi16(T05[i], _mm_srai_epi16(_mm_add_epi16(T04[i], T06[i]), 1)); T07[i] = _mm_sub_epi16(T07[i], _mm_srai_epi16(_mm_add_epi16(T06[i], T08[i]), 1)); T09[i] = _mm_sub_epi16(T09[i], _mm_srai_epi16(_mm_add_epi16(T08[i], T10[i]), 1)); T11[i] = _mm_sub_epi16(T11[i], _mm_srai_epi16(_mm_add_epi16(T10[i], T12[i]), 1)); T13[i] = _mm_sub_epi16(T13[i], _mm_srai_epi16(_mm_add_epi16(T12[i], T14[i]), 1)); T15[i] = _mm_sub_epi16(T15[i], _mm_srai_epi16(_mm_add_epi16(T14[i], T14[i]), 1)); } for (i = 0; i < 8; i++){ T00[i] = _mm_add_epi16(T00[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T01[i], T01[i]), mAddOffset2), 2)); T02[i] = _mm_add_epi16(T02[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T01[i], T03[i]), mAddOffset2), 2)); T04[i] = _mm_add_epi16(T04[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T03[i], T05[i]), mAddOffset2), 2)); T06[i] = _mm_add_epi16(T06[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T05[i], T07[i]), mAddOffset2), 2)); T08[i] = _mm_add_epi16(T08[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T07[i], T09[i]), mAddOffset2), 2)); T10[i] = _mm_add_epi16(T10[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T09[i], T11[i]), mAddOffset2), 2)); T12[i] = _mm_add_epi16(T12[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T11[i], T13[i]), mAddOffset2), 2)); T14[i] = _mm_add_epi16(T14[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(T13[i], T15[i]), mAddOffset2), 2)); } /* step 2: vertical transform */ /* copy ת��*/ TRANSPOSE_8x8_16BIT(T00[0], T02[0], T04[0], T06[0], T08[0], T10[0], T12[0], T14[0], B00, B01, B02, B03, B04, B05, B06, B07); TRANSPOSE_8x8_16BIT(T00[1], T02[1], T04[1], T06[1], T08[1], T10[1], T12[1], T14[1], B08, B09, B10, B11, B12, B13, B14, B15); TRANSPOSE_8x8_16BIT(T00[2], T02[2], T04[2], T06[2], T08[2], T10[2], T12[2], T14[2], B16, B17, B18, B19, B20, B21, B22, B23); TRANSPOSE_8x8_16BIT(T00[3], T02[3], T04[3], T06[3], T08[3], T10[3], T12[3], T14[3], B24, B25, B26, B27, B28, B29, B30, B31); TRANSPOSE_8x8_16BIT(T00[4], T02[4], T04[4], T06[4], T08[4], T10[4], T12[4], T14[4], B32, B33, B34, B35, B36, B37, B38, B39); TRANSPOSE_8x8_16BIT(T00[5], T02[5], T04[5], T06[5], T08[5], T10[5], T12[5], T14[5], B40, B41, B42, B43, B44, B45, B46, B47); TRANSPOSE_8x8_16BIT(T00[6], T02[6], 
T04[6], T06[6], T08[6], T10[6], T12[6], T14[6], B48, B49, B50, B51, B52, B53, B54, B55); TRANSPOSE_8x8_16BIT(T00[7], T02[7], T04[7], T06[7], T08[7], T10[7], T12[7], T14[7], B56, B57, B58, B59, B60, B61, B62, B63); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; B01 = _mm_sub_epi16(B01, _mm_srai_epi16(_mm_add_epi16(B00, B02), 1)); B03 = _mm_sub_epi16(B03, _mm_srai_epi16(_mm_add_epi16(B02, B04), 1)); B05 = _mm_sub_epi16(B05, _mm_srai_epi16(_mm_add_epi16(B04, B06), 1)); B07 = _mm_sub_epi16(B07, _mm_srai_epi16(_mm_add_epi16(B06, B08), 1)); B09 = _mm_sub_epi16(B09, _mm_srai_epi16(_mm_add_epi16(B08, B10), 1)); B11 = _mm_sub_epi16(B11, _mm_srai_epi16(_mm_add_epi16(B10, B12), 1)); B13 = _mm_sub_epi16(B13, _mm_srai_epi16(_mm_add_epi16(B12, B14), 1)); B15 = _mm_sub_epi16(B15, _mm_srai_epi16(_mm_add_epi16(B14, B16), 1)); B17 = _mm_sub_epi16(B17, _mm_srai_epi16(_mm_add_epi16(B16, B18), 1)); B19 = _mm_sub_epi16(B19, _mm_srai_epi16(_mm_add_epi16(B18, B20), 1)); B21 = _mm_sub_epi16(B21, _mm_srai_epi16(_mm_add_epi16(B20, B22), 1)); B23 = _mm_sub_epi16(B23, _mm_srai_epi16(_mm_add_epi16(B22, B24), 1)); B25 = _mm_sub_epi16(B25, _mm_srai_epi16(_mm_add_epi16(B24, B26), 1)); B27 = _mm_sub_epi16(B27, _mm_srai_epi16(_mm_add_epi16(B26, B28), 1)); B29 = _mm_sub_epi16(B29, _mm_srai_epi16(_mm_add_epi16(B28, B30), 1)); B31 = _mm_sub_epi16(B31, _mm_srai_epi16(_mm_add_epi16(B30, B32), 1)); B33 = _mm_sub_epi16(B33, _mm_srai_epi16(_mm_add_epi16(B32, B34), 1)); B35 = _mm_sub_epi16(B35, _mm_srai_epi16(_mm_add_epi16(B34, B36), 1)); B37 = _mm_sub_epi16(B37, _mm_srai_epi16(_mm_add_epi16(B36, B38), 1)); B39 = _mm_sub_epi16(B39, _mm_srai_epi16(_mm_add_epi16(B38, B40), 1)); B41 = _mm_sub_epi16(B41, _mm_srai_epi16(_mm_add_epi16(B40, B42), 1)); B43 = _mm_sub_epi16(B43, _mm_srai_epi16(_mm_add_epi16(B42, B44), 1)); B45 = _mm_sub_epi16(B45, _mm_srai_epi16(_mm_add_epi16(B44, B46), 1)); B47 = _mm_sub_epi16(B47, _mm_srai_epi16(_mm_add_epi16(B46, B48), 1)); B49 = _mm_sub_epi16(B49, _mm_srai_epi16(_mm_add_epi16(B48, B50), 1)); B51 = _mm_sub_epi16(B51, _mm_srai_epi16(_mm_add_epi16(B50, B52), 1)); B53 = _mm_sub_epi16(B53, _mm_srai_epi16(_mm_add_epi16(B52, B54), 1)); B55 = _mm_sub_epi16(B55, _mm_srai_epi16(_mm_add_epi16(B54, B56), 1)); B57 = _mm_sub_epi16(B57, _mm_srai_epi16(_mm_add_epi16(B56, B58), 1)); B59 = _mm_sub_epi16(B59, _mm_srai_epi16(_mm_add_epi16(B58, B60), 1)); B61 = _mm_sub_epi16(B61, _mm_srai_epi16(_mm_add_epi16(B60, B62), 1)); B63 = _mm_sub_epi16(B63, _mm_srai_epi16(_mm_add_epi16(B62, B62), 1)); //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); B00 = _mm_add_epi16(_mm_slli_epi16(B00, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B01, B01), mAddOffset1), 1)); B02 = _mm_add_epi16(_mm_slli_epi16(B02, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B01, B03), mAddOffset1), 1)); B04 = _mm_add_epi16(_mm_slli_epi16(B04, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B03, B05), mAddOffset1), 1)); B06 = _mm_add_epi16(_mm_slli_epi16(B06, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B05, B07), mAddOffset1), 1)); B08 = _mm_add_epi16(_mm_slli_epi16(B08, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B07, B09), mAddOffset1), 1)); B10 = _mm_add_epi16(_mm_slli_epi16(B10, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B09, B11), mAddOffset1), 1)); B12 = _mm_add_epi16(_mm_slli_epi16(B12, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B11, B13), mAddOffset1), 1)); B14 = _mm_add_epi16(_mm_slli_epi16(B14, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B13, B15), mAddOffset1), 1)); B16 = _mm_add_epi16(_mm_slli_epi16(B16, 1), 
_mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B15, B17), mAddOffset1), 1)); B18 = _mm_add_epi16(_mm_slli_epi16(B18, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B17, B19), mAddOffset1), 1)); B20 = _mm_add_epi16(_mm_slli_epi16(B20, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B19, B21), mAddOffset1), 1)); B22 = _mm_add_epi16(_mm_slli_epi16(B22, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B21, B23), mAddOffset1), 1)); B24 = _mm_add_epi16(_mm_slli_epi16(B24, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B23, B25), mAddOffset1), 1)); B26 = _mm_add_epi16(_mm_slli_epi16(B26, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B25, B27), mAddOffset1), 1)); B28 = _mm_add_epi16(_mm_slli_epi16(B28, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B27, B29), mAddOffset1), 1)); B30 = _mm_add_epi16(_mm_slli_epi16(B30, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B29, B31), mAddOffset1), 1)); B32 = _mm_add_epi16(_mm_slli_epi16(B32, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B31, B33), mAddOffset1), 1)); B34 = _mm_add_epi16(_mm_slli_epi16(B34, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B33, B35), mAddOffset1), 1)); B36 = _mm_add_epi16(_mm_slli_epi16(B36, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B35, B37), mAddOffset1), 1)); B38 = _mm_add_epi16(_mm_slli_epi16(B38, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B37, B39), mAddOffset1), 1)); B40 = _mm_add_epi16(_mm_slli_epi16(B40, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B39, B41), mAddOffset1), 1)); B42 = _mm_add_epi16(_mm_slli_epi16(B42, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B41, B43), mAddOffset1), 1)); B44 = _mm_add_epi16(_mm_slli_epi16(B44, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B43, B45), mAddOffset1), 1)); B46 = _mm_add_epi16(_mm_slli_epi16(B46, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B45, B47), mAddOffset1), 1)); B48 = _mm_add_epi16(_mm_slli_epi16(B48, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B47, B49), mAddOffset1), 1)); B50 = _mm_add_epi16(_mm_slli_epi16(B50, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B49, B51), mAddOffset1), 1)); B52 = _mm_add_epi16(_mm_slli_epi16(B52, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B51, B53), mAddOffset1), 1)); B54 = _mm_add_epi16(_mm_slli_epi16(B54, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B53, B55), mAddOffset1), 1)); B56 = _mm_add_epi16(_mm_slli_epi16(B56, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B55, B57), mAddOffset1), 1)); B58 = _mm_add_epi16(_mm_slli_epi16(B58, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B57, B59), mAddOffset1), 1)); B60 = _mm_add_epi16(_mm_slli_epi16(B60, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B59, B61), mAddOffset1), 1)); B62 = _mm_add_epi16(_mm_slli_epi16(B62, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B61, B63), mAddOffset1), 1)); //STORE _mm_store_si128((__m128i*)&coeff[8 * 0], B00); _mm_store_si128((__m128i*)&coeff[8 * 1], B02); _mm_store_si128((__m128i*)&coeff[8 * 2], B04); _mm_store_si128((__m128i*)&coeff[8 * 3], B06); _mm_store_si128((__m128i*)&coeff[8 * 4], B08); _mm_store_si128((__m128i*)&coeff[8 * 5], B10); _mm_store_si128((__m128i*)&coeff[8 * 6], B12); _mm_store_si128((__m128i*)&coeff[8 * 7], B14); _mm_store_si128((__m128i*)&coeff[8 * 8], B16); _mm_store_si128((__m128i*)&coeff[8 * 9], B18); _mm_store_si128((__m128i*)&coeff[8 * 10], B20); _mm_store_si128((__m128i*)&coeff[8 * 11], B22); _mm_store_si128((__m128i*)&coeff[8 * 12], B24); _mm_store_si128((__m128i*)&coeff[8 * 13], B26); _mm_store_si128((__m128i*)&coeff[8 * 14], B28); _mm_store_si128((__m128i*)&coeff[8 * 15], B30); 
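/* The sixteen stores above hold low-pass rows 0..15 (the even outputs B00..B30 of the vertical lifting); the sixteen stores below hold rows 16..31 (B32..B62). After this function the 16x64 block has been reduced to an 8x32 low-pass band laid out with a row stride of 8, which dct_c_16x64_sse128() then hands to the 8x32 DCT. */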
_mm_store_si128((__m128i*)&coeff[8 * 16], B32); _mm_store_si128((__m128i*)&coeff[8 * 17], B34); _mm_store_si128((__m128i*)&coeff[8 * 18], B36); _mm_store_si128((__m128i*)&coeff[8 * 19], B38); _mm_store_si128((__m128i*)&coeff[8 * 20], B40); _mm_store_si128((__m128i*)&coeff[8 * 21], B42); _mm_store_si128((__m128i*)&coeff[8 * 22], B44); _mm_store_si128((__m128i*)&coeff[8 * 23], B46); _mm_store_si128((__m128i*)&coeff[8 * 24], B48); _mm_store_si128((__m128i*)&coeff[8 * 25], B50); _mm_store_si128((__m128i*)&coeff[8 * 26], B52); _mm_store_si128((__m128i*)&coeff[8 * 27], B54); _mm_store_si128((__m128i*)&coeff[8 * 28], B56); _mm_store_si128((__m128i*)&coeff[8 * 29], B58); _mm_store_si128((__m128i*)&coeff[8 * 30], B60); _mm_store_si128((__m128i*)&coeff[8 * 31], B62); } void wavelet_64x16_sse128(coeff_t *coeff) { //���� 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; //���� 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; //��ʱ 64*16 __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4]; //ʱ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; int i; __m128i mAddOffset1 = _mm_set1_epi16(1); __m128i mAddOffset2 = _mm_set1_epi16(2); //load for (i = 0; i < 8; i++) { T00[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 0]); T01[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 1]); T02[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 2]); T03[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 3]); T04[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 4]); T05[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 5]); T06[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 6]); T07[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 7]); T08[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 8]); T09[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 9]); T10[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 10]); T11[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 11]); T12[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 12]); T13[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 13]); T14[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 14]); T15[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 15]); } TRANSPOSE_8x8_16BIT(T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0]); TRANSPOSE_8x8_16BIT(T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_8x8_16BIT(T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0]); TRANSPOSE_8x8_16BIT(T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_8x8_16BIT(T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], V32[0], V33[0], V34[0], 
V35[0], V36[0], V37[0], V38[0], V39[0]); TRANSPOSE_8x8_16BIT(T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5], V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0]); TRANSPOSE_8x8_16BIT(T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6], V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0]); TRANSPOSE_8x8_16BIT(T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0]); TRANSPOSE_8x8_16BIT(T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1]); TRANSPOSE_8x8_16BIT(T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_8x8_16BIT(T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1]); TRANSPOSE_8x8_16BIT(T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_8x8_16BIT(T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1]); TRANSPOSE_8x8_16BIT(T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5], V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1]); TRANSPOSE_8x8_16BIT(T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1]); TRANSPOSE_8x8_16BIT(T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1]); //pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; V01[0] = _mm_sub_epi16(V01[0], _mm_srai_epi16(_mm_add_epi16(V00[0], V02[0]), 1)); V03[0] = _mm_sub_epi16(V03[0], _mm_srai_epi16(_mm_add_epi16(V02[0], V04[0]), 1)); V05[0] = _mm_sub_epi16(V05[0], _mm_srai_epi16(_mm_add_epi16(V04[0], V06[0]), 1)); V07[0] = _mm_sub_epi16(V07[0], _mm_srai_epi16(_mm_add_epi16(V06[0], V08[0]), 1)); V09[0] = _mm_sub_epi16(V09[0], _mm_srai_epi16(_mm_add_epi16(V08[0], V10[0]), 1)); V11[0] = _mm_sub_epi16(V11[0], _mm_srai_epi16(_mm_add_epi16(V10[0], V12[0]), 1)); V13[0] = _mm_sub_epi16(V13[0], _mm_srai_epi16(_mm_add_epi16(V12[0], V14[0]), 1)); V15[0] = _mm_sub_epi16(V15[0], _mm_srai_epi16(_mm_add_epi16(V14[0], V16[0]), 1)); V17[0] = _mm_sub_epi16(V17[0], _mm_srai_epi16(_mm_add_epi16(V16[0], V18[0]), 1)); V19[0] = _mm_sub_epi16(V19[0], _mm_srai_epi16(_mm_add_epi16(V18[0], V20[0]), 1)); V21[0] = _mm_sub_epi16(V21[0], _mm_srai_epi16(_mm_add_epi16(V20[0], V22[0]), 1)); V23[0] = _mm_sub_epi16(V23[0], _mm_srai_epi16(_mm_add_epi16(V22[0], V24[0]), 1)); V25[0] = _mm_sub_epi16(V25[0], _mm_srai_epi16(_mm_add_epi16(V24[0], V26[0]), 1)); V27[0] = _mm_sub_epi16(V27[0], _mm_srai_epi16(_mm_add_epi16(V26[0], V28[0]), 1)); V29[0] = _mm_sub_epi16(V29[0], _mm_srai_epi16(_mm_add_epi16(V28[0], V30[0]), 1)); V31[0] = _mm_sub_epi16(V31[0], _mm_srai_epi16(_mm_add_epi16(V30[0], V32[0]), 1)); V33[0] = _mm_sub_epi16(V33[0], _mm_srai_epi16(_mm_add_epi16(V32[0], V34[0]), 1)); V35[0] = _mm_sub_epi16(V35[0], _mm_srai_epi16(_mm_add_epi16(V34[0], V36[0]), 1)); V37[0] = _mm_sub_epi16(V37[0], _mm_srai_epi16(_mm_add_epi16(V36[0], V38[0]), 1)); V39[0] = _mm_sub_epi16(V39[0], _mm_srai_epi16(_mm_add_epi16(V38[0], V40[0]), 1)); V41[0] = _mm_sub_epi16(V41[0], _mm_srai_epi16(_mm_add_epi16(V40[0], V42[0]), 1)); V43[0] = _mm_sub_epi16(V43[0], _mm_srai_epi16(_mm_add_epi16(V42[0], V44[0]), 1)); V45[0] = 
_mm_sub_epi16(V45[0], _mm_srai_epi16(_mm_add_epi16(V44[0], V46[0]), 1)); V47[0] = _mm_sub_epi16(V47[0], _mm_srai_epi16(_mm_add_epi16(V46[0], V48[0]), 1)); V49[0] = _mm_sub_epi16(V49[0], _mm_srai_epi16(_mm_add_epi16(V48[0], V50[0]), 1)); V51[0] = _mm_sub_epi16(V51[0], _mm_srai_epi16(_mm_add_epi16(V50[0], V52[0]), 1)); V53[0] = _mm_sub_epi16(V53[0], _mm_srai_epi16(_mm_add_epi16(V52[0], V54[0]), 1)); V55[0] = _mm_sub_epi16(V55[0], _mm_srai_epi16(_mm_add_epi16(V54[0], V56[0]), 1)); V57[0] = _mm_sub_epi16(V57[0], _mm_srai_epi16(_mm_add_epi16(V56[0], V58[0]), 1)); V59[0] = _mm_sub_epi16(V59[0], _mm_srai_epi16(_mm_add_epi16(V58[0], V60[0]), 1)); V61[0] = _mm_sub_epi16(V61[0], _mm_srai_epi16(_mm_add_epi16(V60[0], V62[0]), 1)); V63[0] = _mm_sub_epi16(V63[0], _mm_srai_epi16(_mm_add_epi16(V62[0], V62[0]), 1)); V01[1] = _mm_sub_epi16(V01[1], _mm_srai_epi16(_mm_add_epi16(V00[1], V02[1]), 1)); V03[1] = _mm_sub_epi16(V03[1], _mm_srai_epi16(_mm_add_epi16(V02[1], V04[1]), 1)); V05[1] = _mm_sub_epi16(V05[1], _mm_srai_epi16(_mm_add_epi16(V04[1], V06[1]), 1)); V07[1] = _mm_sub_epi16(V07[1], _mm_srai_epi16(_mm_add_epi16(V06[1], V08[1]), 1)); V09[1] = _mm_sub_epi16(V09[1], _mm_srai_epi16(_mm_add_epi16(V08[1], V10[1]), 1)); V11[1] = _mm_sub_epi16(V11[1], _mm_srai_epi16(_mm_add_epi16(V10[1], V12[1]), 1)); V13[1] = _mm_sub_epi16(V13[1], _mm_srai_epi16(_mm_add_epi16(V12[1], V14[1]), 1)); V15[1] = _mm_sub_epi16(V15[1], _mm_srai_epi16(_mm_add_epi16(V14[1], V16[1]), 1)); V17[1] = _mm_sub_epi16(V17[1], _mm_srai_epi16(_mm_add_epi16(V16[1], V18[1]), 1)); V19[1] = _mm_sub_epi16(V19[1], _mm_srai_epi16(_mm_add_epi16(V18[1], V20[1]), 1)); V21[1] = _mm_sub_epi16(V21[1], _mm_srai_epi16(_mm_add_epi16(V20[1], V22[1]), 1)); V23[1] = _mm_sub_epi16(V23[1], _mm_srai_epi16(_mm_add_epi16(V22[1], V24[1]), 1)); V25[1] = _mm_sub_epi16(V25[1], _mm_srai_epi16(_mm_add_epi16(V24[1], V26[1]), 1)); V27[1] = _mm_sub_epi16(V27[1], _mm_srai_epi16(_mm_add_epi16(V26[1], V28[1]), 1)); V29[1] = _mm_sub_epi16(V29[1], _mm_srai_epi16(_mm_add_epi16(V28[1], V30[1]), 1)); V31[1] = _mm_sub_epi16(V31[1], _mm_srai_epi16(_mm_add_epi16(V30[1], V32[1]), 1)); V33[1] = _mm_sub_epi16(V33[1], _mm_srai_epi16(_mm_add_epi16(V32[1], V34[1]), 1)); V35[1] = _mm_sub_epi16(V35[1], _mm_srai_epi16(_mm_add_epi16(V34[1], V36[1]), 1)); V37[1] = _mm_sub_epi16(V37[1], _mm_srai_epi16(_mm_add_epi16(V36[1], V38[1]), 1)); V39[1] = _mm_sub_epi16(V39[1], _mm_srai_epi16(_mm_add_epi16(V38[1], V40[1]), 1)); V41[1] = _mm_sub_epi16(V41[1], _mm_srai_epi16(_mm_add_epi16(V40[1], V42[1]), 1)); V43[1] = _mm_sub_epi16(V43[1], _mm_srai_epi16(_mm_add_epi16(V42[1], V44[1]), 1)); V45[1] = _mm_sub_epi16(V45[1], _mm_srai_epi16(_mm_add_epi16(V44[1], V46[1]), 1)); V47[1] = _mm_sub_epi16(V47[1], _mm_srai_epi16(_mm_add_epi16(V46[1], V48[1]), 1)); V49[1] = _mm_sub_epi16(V49[1], _mm_srai_epi16(_mm_add_epi16(V48[1], V50[1]), 1)); V51[1] = _mm_sub_epi16(V51[1], _mm_srai_epi16(_mm_add_epi16(V50[1], V52[1]), 1)); V53[1] = _mm_sub_epi16(V53[1], _mm_srai_epi16(_mm_add_epi16(V52[1], V54[1]), 1)); V55[1] = _mm_sub_epi16(V55[1], _mm_srai_epi16(_mm_add_epi16(V54[1], V56[1]), 1)); V57[1] = _mm_sub_epi16(V57[1], _mm_srai_epi16(_mm_add_epi16(V56[1], V58[1]), 1)); V59[1] = _mm_sub_epi16(V59[1], _mm_srai_epi16(_mm_add_epi16(V58[1], V60[1]), 1)); V61[1] = _mm_sub_epi16(V61[1], _mm_srai_epi16(_mm_add_epi16(V60[1], V62[1]), 1)); V63[1] = _mm_sub_epi16(V63[1], _mm_srai_epi16(_mm_add_epi16(V62[1], V62[1]), 1)); //pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; V00[0] = _mm_add_epi16(V00[0], 
_mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[0], V01[0]), mAddOffset2), 2)); V02[0] = _mm_add_epi16(V02[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[0], V03[0]), mAddOffset2), 2)); V04[0] = _mm_add_epi16(V04[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V03[0], V05[0]), mAddOffset2), 2)); V06[0] = _mm_add_epi16(V06[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V05[0], V07[0]), mAddOffset2), 2)); V08[0] = _mm_add_epi16(V08[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V07[0], V09[0]), mAddOffset2), 2)); V10[0] = _mm_add_epi16(V10[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V09[0], V11[0]), mAddOffset2), 2)); V12[0] = _mm_add_epi16(V12[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V11[0], V13[0]), mAddOffset2), 2)); V14[0] = _mm_add_epi16(V14[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V13[0], V15[0]), mAddOffset2), 2)); V16[0] = _mm_add_epi16(V16[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V15[0], V17[0]), mAddOffset2), 2)); V18[0] = _mm_add_epi16(V18[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V17[0], V19[0]), mAddOffset2), 2)); V20[0] = _mm_add_epi16(V20[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V19[0], V21[0]), mAddOffset2), 2)); V22[0] = _mm_add_epi16(V22[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V21[0], V23[0]), mAddOffset2), 2)); V24[0] = _mm_add_epi16(V24[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V23[0], V25[0]), mAddOffset2), 2)); V26[0] = _mm_add_epi16(V26[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V25[0], V27[0]), mAddOffset2), 2)); V28[0] = _mm_add_epi16(V28[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V27[0], V29[0]), mAddOffset2), 2)); V30[0] = _mm_add_epi16(V30[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V29[0], V31[0]), mAddOffset2), 2)); V32[0] = _mm_add_epi16(V32[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V31[0], V33[0]), mAddOffset2), 2)); V34[0] = _mm_add_epi16(V34[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V33[0], V35[0]), mAddOffset2), 2)); V36[0] = _mm_add_epi16(V36[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V35[0], V37[0]), mAddOffset2), 2)); V38[0] = _mm_add_epi16(V38[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V37[0], V39[0]), mAddOffset2), 2)); V40[0] = _mm_add_epi16(V40[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V39[0], V41[0]), mAddOffset2), 2)); V42[0] = _mm_add_epi16(V42[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V41[0], V43[0]), mAddOffset2), 2)); V44[0] = _mm_add_epi16(V44[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V43[0], V45[0]), mAddOffset2), 2)); V46[0] = _mm_add_epi16(V46[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V45[0], V47[0]), mAddOffset2), 2)); V48[0] = _mm_add_epi16(V48[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V47[0], V49[0]), mAddOffset2), 2)); V50[0] = _mm_add_epi16(V50[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V49[0], V51[0]), mAddOffset2), 2)); V52[0] = _mm_add_epi16(V52[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V51[0], V53[0]), mAddOffset2), 2)); V54[0] = _mm_add_epi16(V54[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V53[0], V55[0]), mAddOffset2), 2)); V56[0] = _mm_add_epi16(V56[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V55[0], V57[0]), mAddOffset2), 2)); V58[0] = _mm_add_epi16(V58[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V57[0], V59[0]), mAddOffset2), 2)); V60[0] = _mm_add_epi16(V60[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V59[0], V61[0]), mAddOffset2), 2)); V62[0] = _mm_add_epi16(V62[0], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V61[0], V63[0]), mAddOffset2), 2)); V00[1] = _mm_add_epi16(V00[1], 
_mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[1], V01[1]), mAddOffset2), 2)); V02[1] = _mm_add_epi16(V02[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[1], V03[1]), mAddOffset2), 2)); V04[1] = _mm_add_epi16(V04[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V03[1], V05[1]), mAddOffset2), 2)); V06[1] = _mm_add_epi16(V06[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V05[1], V07[1]), mAddOffset2), 2)); V08[1] = _mm_add_epi16(V08[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V07[1], V09[1]), mAddOffset2), 2)); V10[1] = _mm_add_epi16(V10[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V09[1], V11[1]), mAddOffset2), 2)); V12[1] = _mm_add_epi16(V12[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V11[1], V13[1]), mAddOffset2), 2)); V14[1] = _mm_add_epi16(V14[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V13[1], V15[1]), mAddOffset2), 2)); V16[1] = _mm_add_epi16(V16[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V15[1], V17[1]), mAddOffset2), 2)); V18[1] = _mm_add_epi16(V18[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V17[1], V19[1]), mAddOffset2), 2)); V20[1] = _mm_add_epi16(V20[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V19[1], V21[1]), mAddOffset2), 2)); V22[1] = _mm_add_epi16(V22[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V21[1], V23[1]), mAddOffset2), 2)); V24[1] = _mm_add_epi16(V24[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V23[1], V25[1]), mAddOffset2), 2)); V26[1] = _mm_add_epi16(V26[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V25[1], V27[1]), mAddOffset2), 2)); V28[1] = _mm_add_epi16(V28[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V27[1], V29[1]), mAddOffset2), 2)); V30[1] = _mm_add_epi16(V30[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V29[1], V31[1]), mAddOffset2), 2)); V32[1] = _mm_add_epi16(V32[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V31[1], V33[1]), mAddOffset2), 2)); V34[1] = _mm_add_epi16(V34[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V33[1], V35[1]), mAddOffset2), 2)); V36[1] = _mm_add_epi16(V36[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V35[1], V37[1]), mAddOffset2), 2)); V38[1] = _mm_add_epi16(V38[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V37[1], V39[1]), mAddOffset2), 2)); V40[1] = _mm_add_epi16(V40[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V39[1], V41[1]), mAddOffset2), 2)); V42[1] = _mm_add_epi16(V42[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V41[1], V43[1]), mAddOffset2), 2)); V44[1] = _mm_add_epi16(V44[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V43[1], V45[1]), mAddOffset2), 2)); V46[1] = _mm_add_epi16(V46[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V45[1], V47[1]), mAddOffset2), 2)); V48[1] = _mm_add_epi16(V48[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V47[1], V49[1]), mAddOffset2), 2)); V50[1] = _mm_add_epi16(V50[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V49[1], V51[1]), mAddOffset2), 2)); V52[1] = _mm_add_epi16(V52[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V51[1], V53[1]), mAddOffset2), 2)); V54[1] = _mm_add_epi16(V54[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V53[1], V55[1]), mAddOffset2), 2)); V56[1] = _mm_add_epi16(V56[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V55[1], V57[1]), mAddOffset2), 2)); V58[1] = _mm_add_epi16(V58[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V57[1], V59[1]), mAddOffset2), 2)); V60[1] = _mm_add_epi16(V60[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V59[1], V61[1]), mAddOffset2), 2)); V62[1] = _mm_add_epi16(V62[1], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V61[1], V63[1]), mAddOffset2), 2)); /* step 2: vertical transform */ // transpose
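/* step 2 (below): the surviving even columns V00..V62 are transposed back into row order (A00..A15) and an analogous lifting pass is applied down the 16 rows. Note that the vertical update differs from the horizontal one above: instead of pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2, it uses pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1), so the retained low-pass rows are additionally scaled by 2. The 64x16 block is thereby reduced to a 32x8 low-pass band, which dct_c_64x16_sse128() passes on to the 32x8 DCT. Scalar sketch of this vertical pass, for illustration only (the border handling mirrors the intrinsics, which reuse the last available neighbour):
 *
 *   for (y = 1; y < 16; y += 2)   // predict (high-pass samples)
 *       pExt[y] -= (pExt[y - 1] + pExt[y + 1 < 16 ? y + 1 : y - 1]) >> 1;
 *   for (y = 0; y < 16; y += 2)   // update + scale (low-pass samples)
 *       pExt[y] = (pExt[y] << 1) + ((pExt[y > 0 ? y - 1 : 1] + pExt[y + 1] + 1) >> 1);
 */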
TRANSPOSE_8x8_16BIT(V00[0], V02[0], V04[0], V06[0], V08[0], V10[0], V12[0], V14[0], A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0]); TRANSPOSE_8x8_16BIT(V16[0], V18[0], V20[0], V22[0], V24[0], V26[0], V28[0], V30[0], A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1]); TRANSPOSE_8x8_16BIT(V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2]); TRANSPOSE_8x8_16BIT(V48[0], V50[0], V52[0], V54[0], V56[0], V58[0], V60[0], V62[0], A00[3], A01[3], A02[3], A03[3], A04[3], A05[3], A06[3], A07[3]); TRANSPOSE_8x8_16BIT(V00[1], V02[1], V04[1], V06[1], V08[1], V10[1], V12[1], V14[1], A08[0], A09[0], A10[0], A11[0], A12[0], A13[0], A14[0], A15[0]); TRANSPOSE_8x8_16BIT(V16[1], V18[1], V20[1], V22[1], V24[1], V26[1], V28[1], V30[1], A08[1], A09[1], A10[1], A11[1], A12[1], A13[1], A14[1], A15[1]); TRANSPOSE_8x8_16BIT(V32[1], V34[1], V36[1], V38[1], V40[1], V42[1], V44[1], V46[1], A08[2], A09[2], A10[2], A11[2], A12[2], A13[2], A14[2], A15[2]); TRANSPOSE_8x8_16BIT(V48[1], V50[1], V52[1], V54[1], V56[1], V58[1], V60[1], V62[1], A08[3], A09[3], A10[3], A11[3], A12[3], A13[3], A14[3], A15[3]); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; for (i = 0; i < 4; i++){ A01[i] = _mm_sub_epi16(A01[i], _mm_srai_epi16(_mm_add_epi16(A00[i], A02[i]), 1)); A03[i] = _mm_sub_epi16(A03[i], _mm_srai_epi16(_mm_add_epi16(A02[i], A04[i]), 1)); A05[i] = _mm_sub_epi16(A05[i], _mm_srai_epi16(_mm_add_epi16(A04[i], A06[i]), 1)); A07[i] = _mm_sub_epi16(A07[i], _mm_srai_epi16(_mm_add_epi16(A06[i], A08[i]), 1)); A09[i] = _mm_sub_epi16(A09[i], _mm_srai_epi16(_mm_add_epi16(A08[i], A10[i]), 1)); A11[i] = _mm_sub_epi16(A11[i], _mm_srai_epi16(_mm_add_epi16(A10[i], A12[i]), 1)); A13[i] = _mm_sub_epi16(A13[i], _mm_srai_epi16(_mm_add_epi16(A12[i], A14[i]), 1)); A15[i] = _mm_sub_epi16(A15[i], _mm_srai_epi16(_mm_add_epi16(A14[i], A14[i]), 1)); } //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); for (i = 0; i < 4; i++){ A00[i] = _mm_add_epi16(_mm_slli_epi16(A00[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A01[i], A01[i]), mAddOffset1), 1)); A02[i] = _mm_add_epi16(_mm_slli_epi16(A02[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A01[i], A03[i]), mAddOffset1), 1)); A04[i] = _mm_add_epi16(_mm_slli_epi16(A04[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A03[i], A05[i]), mAddOffset1), 1)); A06[i] = _mm_add_epi16(_mm_slli_epi16(A06[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A05[i], A07[i]), mAddOffset1), 1)); A08[i] = _mm_add_epi16(_mm_slli_epi16(A08[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A07[i], A09[i]), mAddOffset1), 1)); A10[i] = _mm_add_epi16(_mm_slli_epi16(A10[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A09[i], A11[i]), mAddOffset1), 1)); A12[i] = _mm_add_epi16(_mm_slli_epi16(A12[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A11[i], A13[i]), mAddOffset1), 1)); A14[i] = _mm_add_epi16(_mm_slli_epi16(A14[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A13[i], A15[i]), mAddOffset1), 1)); } //Store for (i = 0; i < 4; i++){ _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 0], A00[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 1], A02[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 2], A04[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 3], A06[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 4], A08[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 5], A10[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 6], A12[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 7], A14[i]); 
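/* Each pass of the surrounding store loop writes one group of 8 of the 32 retained columns for all 8 low-pass rows, i.e. the final 32x8 band sits at the start of coeff[] with a row stride of 32 coefficients, matching the (dst, dst, 32 | 0x01) call made from dct_c_64x16_sse128(). */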
} } void wavelet_64x64_sse128(coeff_t *coeff) { //���� 16*64 __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8]; //���� 64*64 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8]; //ʱ 32*64 __m128i A00[4], A01[4], A02[4], A03[4], A04[4], A05[4], A06[4], A07[4], A08[4], A09[4], A10[4], A11[4], A12[4], A13[4], A14[4], A15[4], A16[4], A17[4], A18[4], A19[4], A20[4], A21[4], A22[4], A23[4], A24[4], A25[4], A26[4], A27[4], A28[4], A29[4], A30[4], A31[4], A32[4], A33[4], A34[4], A35[4], A36[4], A37[4], A38[4], A39[4], A40[4], A41[4], A42[4], A43[4], A44[4], A45[4], A46[4], A47[4], A48[4], A49[4], A50[4], A51[4], A52[4], A53[4], A54[4], A55[4], A56[4], A57[4], A58[4], A59[4], A60[4], A61[4], A62[4], A63[4]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i mAddOffset1 = _mm_set1_epi16(1); __m128i mAddOffset2 = _mm_set1_epi16(2); int i; for (i = 0; i < 8; i++){ T00[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 0]); T01[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 1]); T02[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 2]); T03[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 3]); T04[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 4]); T05[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 5]); T06[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 6]); T07[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 7]); T08[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 8]); T09[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 9]); T10[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 10]); T11[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 11]); T12[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 12]); T13[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 13]); T14[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 14]); T15[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 15]); T16[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 16]); T17[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 17]); T18[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 18]); T19[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 19]); T20[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 20]); T21[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 21]); T22[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 22]); T23[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 23]); T24[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 24]); T25[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 25]); T26[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 26]); T27[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 
27]); T28[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 28]); T29[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 29]); T30[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 30]); T31[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 31]); T32[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 32]); T33[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 33]); T34[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 34]); T35[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 35]); T36[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 36]); T37[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 37]); T38[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 38]); T39[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 39]); T40[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 40]); T41[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 41]); T42[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 42]); T43[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 43]); T44[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 44]); T45[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 45]); T46[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 46]); T47[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 47]); T48[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 48]); T49[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 49]); T50[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 50]); T51[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 51]); T52[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 52]); T53[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 53]); T54[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 54]); T55[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 55]); T56[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 56]); T57[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 57]); T58[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 58]); T59[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 59]); T60[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 60]); T61[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 61]); T62[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 62]); T63[i] = _mm_load_si128((__m128i*)&coeff[8 * i + 64 * 63]); } //0-15��ת�� TRANSPOSE_16x16_16BIT( T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1] ); TRANSPOSE_16x16_16BIT( T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1] ); TRANSPOSE_16x16_16BIT( T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5], V32[0], V33[0], 
V34[0], V35[0], V36[0], V37[0], V38[0], V39[0], V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0], V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1], V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1] ); TRANSPOSE_16x16_16BIT( T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7], V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1] ); //16-31��ת�� TRANSPOSE_16x16_16BIT( T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V00[3], V01[3], V02[3], V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], V14[3], V15[3] ); TRANSPOSE_16x16_16BIT( T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2], T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3] ); TRANSPOSE_16x16_16BIT( T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T16[5], T17[5], T18[5], T19[5], T20[5], T21[5], T22[5], T23[5], T24[5], T25[5], T26[5], T27[5], T28[5], T29[5], T30[5], T31[5], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V32[3], V33[3], V34[3], V35[3], V36[3], V37[3], V38[3], V39[3], V40[3], V41[3], V42[3], V43[3], V44[3], V45[3], V46[3], V47[3] ); TRANSPOSE_16x16_16BIT( T16[6], T17[6], T18[6], T19[6], T20[6], T21[6], T22[6], T23[6], T24[6], T25[6], T26[6], T27[6], T28[6], T29[6], T30[6], T31[6], T16[7], T17[7], T18[7], T19[7], T20[7], T21[7], T22[7], T23[7], T24[7], T25[7], T26[7], T27[7], T28[7], T29[7], T30[7], T31[7], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2], V48[3], V49[3], V50[3], V51[3], V52[3], V53[3], V54[3], V55[3], V56[3], V57[3], V58[3], V59[3], V60[3], V61[3], V62[3], V63[3] ); //32-47��ת�� TRANSPOSE_16x16_16BIT( T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1], V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V00[5], V01[5], V02[5], V03[5], V04[5], V05[5], V06[5], V07[5], V08[5], V09[5], V10[5], V11[5], V12[5], V13[5], V14[5], V15[5] ); TRANSPOSE_16x16_16BIT( T32[2], T33[2], T34[2], T35[2], T36[2], 
T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2], T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V16[5], V17[5], V18[5], V19[5], V20[5], V21[5], V22[5], V23[5], V24[5], V25[5], V26[5], V27[5], V28[5], V29[5], V30[5], V31[5] ); TRANSPOSE_16x16_16BIT( T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T32[5], T33[5], T34[5], T35[5], T36[5], T37[5], T38[5], T39[5], T40[5], T41[5], T42[5], T43[5], T44[5], T45[5], T46[5], T47[5], V32[4], V33[4], V34[4], V35[4], V36[4], V37[4], V38[4], V39[4], V40[4], V41[4], V42[4], V43[4], V44[4], V45[4], V46[4], V47[4], V32[5], V33[5], V34[5], V35[5], V36[5], V37[5], V38[5], V39[5], V40[5], V41[5], V42[5], V43[5], V44[5], V45[5], V46[5], V47[5] ); TRANSPOSE_16x16_16BIT( T32[6], T33[6], T34[6], T35[6], T36[6], T37[6], T38[6], T39[6], T40[6], T41[6], T42[6], T43[6], T44[6], T45[6], T46[6], T47[6], T32[7], T33[7], T34[7], T35[7], T36[7], T37[7], T38[7], T39[7], T40[7], T41[7], T42[7], T43[7], T44[7], T45[7], T46[7], T47[7], V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4], V48[5], V49[5], V50[5], V51[5], V52[5], V53[5], V54[5], V55[5], V56[5], V57[5], V58[5], V59[5], V60[5], V61[5], V62[5], V63[5] ); //48-63��ת�� TRANSPOSE_16x16_16BIT( T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1], V00[6], V01[6], V02[6], V03[6], V04[6], V05[6], V06[6], V07[6], V08[6], V09[6], V10[6], V11[6], V12[6], V13[6], V14[6], V15[6], V00[7], V01[7], V02[7], V03[7], V04[7], V05[7], V06[7], V07[7], V08[7], V09[7], V10[7], V11[7], V12[7], V13[7], V14[7], V15[7] ); TRANSPOSE_16x16_16BIT( T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2], T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3], V16[6], V17[6], V18[6], V19[6], V20[6], V21[6], V22[6], V23[6], V24[6], V25[6], V26[6], V27[6], V28[6], V29[6], V30[6], V31[6], V16[7], V17[7], V18[7], V19[7], V20[7], V21[7], V22[7], V23[7], V24[7], V25[7], V26[7], V27[7], V28[7], V29[7], V30[7], V31[7] ); TRANSPOSE_16x16_16BIT( T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4], T48[5], T49[5], T50[5], T51[5], T52[5], T53[5], T54[5], T55[5], T56[5], T57[5], T58[5], T59[5], T60[5], T61[5], T62[5], T63[5], V32[6], V33[6], V34[6], V35[6], V36[6], V37[6], V38[6], V39[6], V40[6], V41[6], V42[6], V43[6], V44[6], V45[6], V46[6], V47[6], V32[7], V33[7], V34[7], V35[7], V36[7], V37[7], V38[7], V39[7], V40[7], V41[7], V42[7], V43[7], V44[7], V45[7], V46[7], V47[7] ); TRANSPOSE_16x16_16BIT( T48[6], T49[6], T50[6], T51[6], T52[6], T53[6], T54[6], T55[6], T56[6], T57[6], T58[6], T59[6], T60[6], T61[6], T62[6], T63[6], T48[7], T49[7], T50[7], T51[7], T52[7], T53[7], T54[7], T55[7], T56[7], T57[7], T58[7], T59[7], T60[7], T61[7], T62[7], T63[7], V48[6], V49[6], V50[6], V51[6], V52[6], V53[6], V54[6], V55[6], V56[6], V57[6], V58[6], V59[6], V60[6], 
V61[6], V62[6], V63[6], V48[7], V49[7], V50[7], V51[7], V52[7], V53[7], V54[7], V55[7], V56[7], V57[7], V58[7], V59[7], V60[7], V61[7], V62[7], V63[7] ); //pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; for (i = 0; i < 8; i++){ V01[i] = _mm_sub_epi16(V01[i], _mm_srai_epi16(_mm_add_epi16(V00[i], V02[i]), 1)); V03[i] = _mm_sub_epi16(V03[i], _mm_srai_epi16(_mm_add_epi16(V02[i], V04[i]), 1)); V05[i] = _mm_sub_epi16(V05[i], _mm_srai_epi16(_mm_add_epi16(V04[i], V06[i]), 1)); V07[i] = _mm_sub_epi16(V07[i], _mm_srai_epi16(_mm_add_epi16(V06[i], V08[i]), 1)); V09[i] = _mm_sub_epi16(V09[i], _mm_srai_epi16(_mm_add_epi16(V08[i], V10[i]), 1)); V11[i] = _mm_sub_epi16(V11[i], _mm_srai_epi16(_mm_add_epi16(V10[i], V12[i]), 1)); V13[i] = _mm_sub_epi16(V13[i], _mm_srai_epi16(_mm_add_epi16(V12[i], V14[i]), 1)); V15[i] = _mm_sub_epi16(V15[i], _mm_srai_epi16(_mm_add_epi16(V14[i], V16[i]), 1)); V17[i] = _mm_sub_epi16(V17[i], _mm_srai_epi16(_mm_add_epi16(V16[i], V18[i]), 1)); V19[i] = _mm_sub_epi16(V19[i], _mm_srai_epi16(_mm_add_epi16(V18[i], V20[i]), 1)); V21[i] = _mm_sub_epi16(V21[i], _mm_srai_epi16(_mm_add_epi16(V20[i], V22[i]), 1)); V23[i] = _mm_sub_epi16(V23[i], _mm_srai_epi16(_mm_add_epi16(V22[i], V24[i]), 1)); V25[i] = _mm_sub_epi16(V25[i], _mm_srai_epi16(_mm_add_epi16(V24[i], V26[i]), 1)); V27[i] = _mm_sub_epi16(V27[i], _mm_srai_epi16(_mm_add_epi16(V26[i], V28[i]), 1)); V29[i] = _mm_sub_epi16(V29[i], _mm_srai_epi16(_mm_add_epi16(V28[i], V30[i]), 1)); V31[i] = _mm_sub_epi16(V31[i], _mm_srai_epi16(_mm_add_epi16(V30[i], V32[i]), 1)); V33[i] = _mm_sub_epi16(V33[i], _mm_srai_epi16(_mm_add_epi16(V32[i], V34[i]), 1)); V35[i] = _mm_sub_epi16(V35[i], _mm_srai_epi16(_mm_add_epi16(V34[i], V36[i]), 1)); V37[i] = _mm_sub_epi16(V37[i], _mm_srai_epi16(_mm_add_epi16(V36[i], V38[i]), 1)); V39[i] = _mm_sub_epi16(V39[i], _mm_srai_epi16(_mm_add_epi16(V38[i], V40[i]), 1)); V41[i] = _mm_sub_epi16(V41[i], _mm_srai_epi16(_mm_add_epi16(V40[i], V42[i]), 1)); V43[i] = _mm_sub_epi16(V43[i], _mm_srai_epi16(_mm_add_epi16(V42[i], V44[i]), 1)); V45[i] = _mm_sub_epi16(V45[i], _mm_srai_epi16(_mm_add_epi16(V44[i], V46[i]), 1)); V47[i] = _mm_sub_epi16(V47[i], _mm_srai_epi16(_mm_add_epi16(V46[i], V48[i]), 1)); V49[i] = _mm_sub_epi16(V49[i], _mm_srai_epi16(_mm_add_epi16(V48[i], V50[i]), 1)); V51[i] = _mm_sub_epi16(V51[i], _mm_srai_epi16(_mm_add_epi16(V50[i], V52[i]), 1)); V53[i] = _mm_sub_epi16(V53[i], _mm_srai_epi16(_mm_add_epi16(V52[i], V54[i]), 1)); V55[i] = _mm_sub_epi16(V55[i], _mm_srai_epi16(_mm_add_epi16(V54[i], V56[i]), 1)); V57[i] = _mm_sub_epi16(V57[i], _mm_srai_epi16(_mm_add_epi16(V56[i], V58[i]), 1)); V59[i] = _mm_sub_epi16(V59[i], _mm_srai_epi16(_mm_add_epi16(V58[i], V60[i]), 1)); V61[i] = _mm_sub_epi16(V61[i], _mm_srai_epi16(_mm_add_epi16(V60[i], V62[i]), 1)); V63[i] = _mm_sub_epi16(V63[i], _mm_srai_epi16(_mm_add_epi16(V62[i], V62[i]), 1)); } //pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; for (i = 0; i < 8; i++){ V00[i] = _mm_add_epi16(V00[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[i], V01[i]), mAddOffset2), 2)); V02[i] = _mm_add_epi16(V02[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V01[i], V03[i]), mAddOffset2), 2)); V04[i] = _mm_add_epi16(V04[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V03[i], V05[i]), mAddOffset2), 2)); V06[i] = _mm_add_epi16(V06[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V05[i], V07[i]), mAddOffset2), 2)); V08[i] = _mm_add_epi16(V08[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V07[i], V09[i]), mAddOffset2), 2)); V10[i] = _mm_add_epi16(V10[i], 
_mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V09[i], V11[i]), mAddOffset2), 2)); V12[i] = _mm_add_epi16(V12[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V11[i], V13[i]), mAddOffset2), 2)); V14[i] = _mm_add_epi16(V14[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V13[i], V15[i]), mAddOffset2), 2)); V16[i] = _mm_add_epi16(V16[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V15[i], V17[i]), mAddOffset2), 2)); V18[i] = _mm_add_epi16(V18[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V17[i], V19[i]), mAddOffset2), 2)); V20[i] = _mm_add_epi16(V20[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V19[i], V21[i]), mAddOffset2), 2)); V22[i] = _mm_add_epi16(V22[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V21[i], V23[i]), mAddOffset2), 2)); V24[i] = _mm_add_epi16(V24[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V23[i], V25[i]), mAddOffset2), 2)); V26[i] = _mm_add_epi16(V26[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V25[i], V27[i]), mAddOffset2), 2)); V28[i] = _mm_add_epi16(V28[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V27[i], V29[i]), mAddOffset2), 2)); V30[i] = _mm_add_epi16(V30[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V29[i], V31[i]), mAddOffset2), 2)); V32[i] = _mm_add_epi16(V32[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V31[i], V33[i]), mAddOffset2), 2)); V34[i] = _mm_add_epi16(V34[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V33[i], V35[i]), mAddOffset2), 2)); V36[i] = _mm_add_epi16(V36[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V35[i], V37[i]), mAddOffset2), 2)); V38[i] = _mm_add_epi16(V38[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V37[i], V39[i]), mAddOffset2), 2)); V40[i] = _mm_add_epi16(V40[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V39[i], V41[i]), mAddOffset2), 2)); V42[i] = _mm_add_epi16(V42[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V41[i], V43[i]), mAddOffset2), 2)); V44[i] = _mm_add_epi16(V44[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V43[i], V45[i]), mAddOffset2), 2)); V46[i] = _mm_add_epi16(V46[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V45[i], V47[i]), mAddOffset2), 2)); V48[i] = _mm_add_epi16(V48[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V47[i], V49[i]), mAddOffset2), 2)); V50[i] = _mm_add_epi16(V50[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V49[i], V51[i]), mAddOffset2), 2)); V52[i] = _mm_add_epi16(V52[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V51[i], V53[i]), mAddOffset2), 2)); V54[i] = _mm_add_epi16(V54[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V53[i], V55[i]), mAddOffset2), 2)); V56[i] = _mm_add_epi16(V56[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V55[i], V57[i]), mAddOffset2), 2)); V58[i] = _mm_add_epi16(V58[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V57[i], V59[i]), mAddOffset2), 2)); V60[i] = _mm_add_epi16(V60[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V59[i], V61[i]), mAddOffset2), 2)); V62[i] = _mm_add_epi16(V62[i], _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(V61[i], V63[i]), mAddOffset2), 2)); } //1-15 TRANSPOSE_16x16_16BIT( V00[0], V02[0], V04[0], V06[0], V08[0], V10[0], V12[0], V14[0], V16[0], V18[0], V20[0], V22[0], V24[0], V26[0], V28[0], V30[0], V00[1], V02[1], V04[1], V06[1], V08[1], V10[1], V12[1], V14[1], V16[1], V18[1], V20[1], V22[1], V24[1], V26[1], V28[1], V30[1], A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0], A08[0], A09[0], A10[0], A11[0], A12[0], A13[0], A14[0], A15[0], A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1], A08[1], A09[1], A10[1], A11[1], A12[1], A13[1], A14[1], A15[1] ); TRANSPOSE_16x16_16BIT( V00[2], V02[2], V04[2], V06[2], V08[2], 
V10[2], V12[2], V14[2], V16[2], V18[2], V20[2], V22[2], V24[2], V26[2], V28[2], V30[2], V00[3], V02[3], V04[3], V06[3], V08[3], V10[3], V12[3], V14[3], V16[3], V18[3], V20[3], V22[3], V24[3], V26[3], V28[3], V30[3], A16[0], A17[0], A18[0], A19[0], A20[0], A21[0], A22[0], A23[0], A24[0], A25[0], A26[0], A27[0], A28[0], A29[0], A30[0], A31[0], A16[1], A17[1], A18[1], A19[1], A20[1], A21[1], A22[1], A23[1], A24[1], A25[1], A26[1], A27[1], A28[1], A29[1], A30[1], A31[1] ); TRANSPOSE_16x16_16BIT( V00[4], V02[4], V04[4], V06[4], V08[4], V10[4], V12[4], V14[4], V16[4], V18[4], V20[4], V22[4], V24[4], V26[4], V28[4], V30[4], V00[5], V02[5], V04[5], V06[5], V08[5], V10[5], V12[5], V14[5], V16[5], V18[5], V20[5], V22[5], V24[5], V26[5], V28[5], V30[5], A32[0], A33[0], A34[0], A35[0], A36[0], A37[0], A38[0], A39[0], A40[0], A41[0], A42[0], A43[0], A44[0], A45[0], A46[0], A47[0], A32[1], A33[1], A34[1], A35[1], A36[1], A37[1], A38[1], A39[1], A40[1], A41[1], A42[1], A43[1], A44[1], A45[1], A46[1], A47[1] ); TRANSPOSE_16x16_16BIT( V00[6], V02[6], V04[6], V06[6], V08[6], V10[6], V12[6], V14[6], V16[6], V18[6], V20[6], V22[6], V24[6], V26[6], V28[6], V30[6], V00[7], V02[7], V04[7], V06[7], V08[7], V10[7], V12[7], V14[7], V16[7], V18[7], V20[7], V22[7], V24[7], V26[7], V28[7], V30[7], A48[0], A49[0], A50[0], A51[0], A52[0], A53[0], A54[0], A55[0], A56[0], A57[0], A58[0], A59[0], A60[0], A61[0], A62[0], A63[0], A48[1], A49[1], A50[1], A51[1], A52[1], A53[1], A54[1], A55[1], A56[1], A57[1], A58[1], A59[1], A60[1], A61[1], A62[1], A63[1] ); //16-31�� TRANSPOSE_16x16_16BIT( V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], V48[0], V50[0], V52[0], V54[0], V56[0], V58[0], V60[0], V62[0], V32[1], V34[1], V36[1], V38[1], V40[1], V42[1], V44[1], V46[1], V48[1], V50[1], V52[1], V54[1], V56[1], V58[1], V60[1], V62[1], A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2], A08[2], A09[2], A10[2], A11[2], A12[2], A13[2], A14[2], A15[2], A00[3], A01[3], A02[3], A03[3], A04[3], A05[3], A06[3], A07[3], A08[3], A09[3], A10[3], A11[3], A12[3], A13[3], A14[3], A15[3] ); TRANSPOSE_16x16_16BIT( V32[2], V34[2], V36[2], V38[2], V40[2], V42[2], V44[2], V46[2], V48[2], V50[2], V52[2], V54[2], V56[2], V58[2], V60[2], V62[2], V32[3], V34[3], V36[3], V38[3], V40[3], V42[3], V44[3], V46[3], V48[3], V50[3], V52[3], V54[3], V56[3], V58[3], V60[3], V62[3], A16[2], A17[2], A18[2], A19[2], A20[2], A21[2], A22[2], A23[2], A24[2], A25[2], A26[2], A27[2], A28[2], A29[2], A30[2], A31[2], A16[3], A17[3], A18[3], A19[3], A20[3], A21[3], A22[3], A23[3], A24[3], A25[3], A26[3], A27[3], A28[3], A29[3], A30[3], A31[3] ); TRANSPOSE_16x16_16BIT( V32[4], V34[4], V36[4], V38[4], V40[4], V42[4], V44[4], V46[4], V48[4], V50[4], V52[4], V54[4], V56[4], V58[4], V60[4], V62[4], V32[5], V34[5], V36[5], V38[5], V40[5], V42[5], V44[5], V46[5], V48[5], V50[5], V52[5], V54[5], V56[5], V58[5], V60[5], V62[5], A32[2], A33[2], A34[2], A35[2], A36[2], A37[2], A38[2], A39[2], A40[2], A41[2], A42[2], A43[2], A44[2], A45[2], A46[2], A47[2], A32[3], A33[3], A34[3], A35[3], A36[3], A37[3], A38[3], A39[3], A40[3], A41[3], A42[3], A43[3], A44[3], A45[3], A46[3], A47[3] ); TRANSPOSE_16x16_16BIT( V32[6], V34[6], V36[6], V38[6], V40[6], V42[6], V44[6], V46[6], V48[6], V50[6], V52[6], V54[6], V56[6], V58[6], V60[6], V62[6], V32[7], V34[7], V36[7], V38[7], V40[7], V42[7], V44[7], V46[7], V48[7], V50[7], V52[7], V54[7], V56[7], V58[7], V60[7], V62[7], A48[2], A49[2], A50[2], A51[2], A52[2], A53[2], A54[2], A55[2], A56[2], A57[2], A58[2], A59[2], A60[2], 
A61[2], A62[2], A63[2], A48[3], A49[3], A50[3], A51[3], A52[3], A53[3], A54[3], A55[3], A56[3], A57[3], A58[3], A59[3], A60[3], A61[3], A62[3], A63[3] ); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; for (i = 0; i < 4; i++){ A01[i] = _mm_sub_epi16(A01[i], _mm_srai_epi16(_mm_add_epi16(A00[i], A02[i]), 1)); A03[i] = _mm_sub_epi16(A03[i], _mm_srai_epi16(_mm_add_epi16(A02[i], A04[i]), 1)); A05[i] = _mm_sub_epi16(A05[i], _mm_srai_epi16(_mm_add_epi16(A04[i], A06[i]), 1)); A07[i] = _mm_sub_epi16(A07[i], _mm_srai_epi16(_mm_add_epi16(A06[i], A08[i]), 1)); A09[i] = _mm_sub_epi16(A09[i], _mm_srai_epi16(_mm_add_epi16(A08[i], A10[i]), 1)); A11[i] = _mm_sub_epi16(A11[i], _mm_srai_epi16(_mm_add_epi16(A10[i], A12[i]), 1)); A13[i] = _mm_sub_epi16(A13[i], _mm_srai_epi16(_mm_add_epi16(A12[i], A14[i]), 1)); A15[i] = _mm_sub_epi16(A15[i], _mm_srai_epi16(_mm_add_epi16(A14[i], A16[i]), 1)); A17[i] = _mm_sub_epi16(A17[i], _mm_srai_epi16(_mm_add_epi16(A16[i], A18[i]), 1)); A19[i] = _mm_sub_epi16(A19[i], _mm_srai_epi16(_mm_add_epi16(A18[i], A20[i]), 1)); A21[i] = _mm_sub_epi16(A21[i], _mm_srai_epi16(_mm_add_epi16(A20[i], A22[i]), 1)); A23[i] = _mm_sub_epi16(A23[i], _mm_srai_epi16(_mm_add_epi16(A22[i], A24[i]), 1)); A25[i] = _mm_sub_epi16(A25[i], _mm_srai_epi16(_mm_add_epi16(A24[i], A26[i]), 1)); A27[i] = _mm_sub_epi16(A27[i], _mm_srai_epi16(_mm_add_epi16(A26[i], A28[i]), 1)); A29[i] = _mm_sub_epi16(A29[i], _mm_srai_epi16(_mm_add_epi16(A28[i], A30[i]), 1)); A31[i] = _mm_sub_epi16(A31[i], _mm_srai_epi16(_mm_add_epi16(A30[i], A32[i]), 1)); A33[i] = _mm_sub_epi16(A33[i], _mm_srai_epi16(_mm_add_epi16(A32[i], A34[i]), 1)); A35[i] = _mm_sub_epi16(A35[i], _mm_srai_epi16(_mm_add_epi16(A34[i], A36[i]), 1)); A37[i] = _mm_sub_epi16(A37[i], _mm_srai_epi16(_mm_add_epi16(A36[i], A38[i]), 1)); A39[i] = _mm_sub_epi16(A39[i], _mm_srai_epi16(_mm_add_epi16(A38[i], A40[i]), 1)); A41[i] = _mm_sub_epi16(A41[i], _mm_srai_epi16(_mm_add_epi16(A40[i], A42[i]), 1)); A43[i] = _mm_sub_epi16(A43[i], _mm_srai_epi16(_mm_add_epi16(A42[i], A44[i]), 1)); A45[i] = _mm_sub_epi16(A45[i], _mm_srai_epi16(_mm_add_epi16(A44[i], A46[i]), 1)); A47[i] = _mm_sub_epi16(A47[i], _mm_srai_epi16(_mm_add_epi16(A46[i], A48[i]), 1)); A49[i] = _mm_sub_epi16(A49[i], _mm_srai_epi16(_mm_add_epi16(A48[i], A50[i]), 1)); A51[i] = _mm_sub_epi16(A51[i], _mm_srai_epi16(_mm_add_epi16(A50[i], A52[i]), 1)); A53[i] = _mm_sub_epi16(A53[i], _mm_srai_epi16(_mm_add_epi16(A52[i], A54[i]), 1)); A55[i] = _mm_sub_epi16(A55[i], _mm_srai_epi16(_mm_add_epi16(A54[i], A56[i]), 1)); A57[i] = _mm_sub_epi16(A57[i], _mm_srai_epi16(_mm_add_epi16(A56[i], A58[i]), 1)); A59[i] = _mm_sub_epi16(A59[i], _mm_srai_epi16(_mm_add_epi16(A58[i], A60[i]), 1)); A61[i] = _mm_sub_epi16(A61[i], _mm_srai_epi16(_mm_add_epi16(A60[i], A62[i]), 1)); A63[i] = _mm_sub_epi16(A63[i], _mm_srai_epi16(_mm_add_epi16(A62[i], A62[i]), 1)); } //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); for (i = 0; i < 4; i++){ A00[i] = _mm_add_epi16(_mm_slli_epi16(A00[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A01[i], A01[i]), mAddOffset1), 1)); A02[i] = _mm_add_epi16(_mm_slli_epi16(A02[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A01[i], A03[i]), mAddOffset1), 1)); A04[i] = _mm_add_epi16(_mm_slli_epi16(A04[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A03[i], A05[i]), mAddOffset1), 1)); A06[i] = _mm_add_epi16(_mm_slli_epi16(A06[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A05[i], A07[i]), mAddOffset1), 1)); A08[i] = _mm_add_epi16(_mm_slli_epi16(A08[i], 1), 
_mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A07[i], A09[i]), mAddOffset1), 1)); A10[i] = _mm_add_epi16(_mm_slli_epi16(A10[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A09[i], A11[i]), mAddOffset1), 1)); A12[i] = _mm_add_epi16(_mm_slli_epi16(A12[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A11[i], A13[i]), mAddOffset1), 1)); A14[i] = _mm_add_epi16(_mm_slli_epi16(A14[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A13[i], A15[i]), mAddOffset1), 1)); A16[i] = _mm_add_epi16(_mm_slli_epi16(A16[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A15[i], A17[i]), mAddOffset1), 1)); A18[i] = _mm_add_epi16(_mm_slli_epi16(A18[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A17[i], A19[i]), mAddOffset1), 1)); A20[i] = _mm_add_epi16(_mm_slli_epi16(A20[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A19[i], A21[i]), mAddOffset1), 1)); A22[i] = _mm_add_epi16(_mm_slli_epi16(A22[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A21[i], A23[i]), mAddOffset1), 1)); A24[i] = _mm_add_epi16(_mm_slli_epi16(A24[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A23[i], A25[i]), mAddOffset1), 1)); A26[i] = _mm_add_epi16(_mm_slli_epi16(A26[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A25[i], A27[i]), mAddOffset1), 1)); A28[i] = _mm_add_epi16(_mm_slli_epi16(A28[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A27[i], A29[i]), mAddOffset1), 1)); A30[i] = _mm_add_epi16(_mm_slli_epi16(A30[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A29[i], A31[i]), mAddOffset1), 1)); A32[i] = _mm_add_epi16(_mm_slli_epi16(A32[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A31[i], A33[i]), mAddOffset1), 1)); A34[i] = _mm_add_epi16(_mm_slli_epi16(A34[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A33[i], A35[i]), mAddOffset1), 1)); A36[i] = _mm_add_epi16(_mm_slli_epi16(A36[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A35[i], A37[i]), mAddOffset1), 1)); A38[i] = _mm_add_epi16(_mm_slli_epi16(A38[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A37[i], A39[i]), mAddOffset1), 1)); A40[i] = _mm_add_epi16(_mm_slli_epi16(A40[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A39[i], A41[i]), mAddOffset1), 1)); A42[i] = _mm_add_epi16(_mm_slli_epi16(A42[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A41[i], A43[i]), mAddOffset1), 1)); A44[i] = _mm_add_epi16(_mm_slli_epi16(A44[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A43[i], A45[i]), mAddOffset1), 1)); A46[i] = _mm_add_epi16(_mm_slli_epi16(A46[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A45[i], A47[i]), mAddOffset1), 1)); A48[i] = _mm_add_epi16(_mm_slli_epi16(A48[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A47[i], A49[i]), mAddOffset1), 1)); A50[i] = _mm_add_epi16(_mm_slli_epi16(A50[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A49[i], A51[i]), mAddOffset1), 1)); A52[i] = _mm_add_epi16(_mm_slli_epi16(A52[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A51[i], A53[i]), mAddOffset1), 1)); A54[i] = _mm_add_epi16(_mm_slli_epi16(A54[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A53[i], A55[i]), mAddOffset1), 1)); A56[i] = _mm_add_epi16(_mm_slli_epi16(A56[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A55[i], A57[i]), mAddOffset1), 1)); A58[i] = _mm_add_epi16(_mm_slli_epi16(A58[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A57[i], A59[i]), mAddOffset1), 1)); A60[i] = _mm_add_epi16(_mm_slli_epi16(A60[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A59[i], A61[i]), mAddOffset1), 1)); A62[i] = _mm_add_epi16(_mm_slli_epi16(A62[i], 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(A61[i], A63[i]), 
mAddOffset1), 1)); } //STORE for (i = 0; i < 4; i++){ _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 0], A00[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 1], A02[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 2], A04[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 3], A06[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 4], A08[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 5], A10[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 6], A12[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 7], A14[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 8], A16[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 9], A18[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 10], A20[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 11], A22[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 12], A24[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 13], A26[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 14], A28[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 15], A30[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 16], A32[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 17], A34[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 18], A36[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 19], A38[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 20], A40[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 21], A42[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 22], A44[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 23], A46[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 24], A48[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 25], A50[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 26], A52[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 27], A54[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 28], A56[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 29], A58[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 30], A60[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 32 * 31], A62[i]); } } /* --------------------------------------------------------------------------- */ void dct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_sse128(dst); dct_c_32x32_sse128(dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ void dct_c_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_sse128(dst); dct_c_32x32_half_sse128(dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ void dct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x16_sse128(dst); dct_c_32x8_sse128(dst, dst, 32 | 0x01); } /* --------------------------------------------------------------------------- */ void dct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_16x64_sse128(dst); dct_c_8x32_sse128(dst, dst, 8 | 0x01); } xavs2-1.3/source/common/vec/intrinsic_dct_avx.c000066400000000000000000007655311340660520300216630ustar00rootroot00000000000000/* * intrinsic_dct_avx.c * * Description of this file: * AVX2 assembly functions of DCT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO Jiaqi ZHANG Tianliang FU * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include <xmmintrin.h> // SSE #include <pmmintrin.h> // SSE3 #include <tmmintrin.h> // SSSE3 #include <immintrin.h> // AVX and AVX2 #include "../basic_types.h" #include "intrinsic.h" #include "../avs2_defs.h" /* disable warnings */ #ifdef _MSC_VER #pragma warning(disable:4127) // warning C4127: conditional expression is constant #endif #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) /* --------------------------------------------------------------------------- * functions defined in this file: * dct16, dct32 */ ALIGN32(static const int16_t tab_dct_4[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 42, 17, 42, 17, 42, 17, 42, 17 }, { 32, -32, 32, -32, 32, -32, 32, -32 }, { 17, -42, 17, -42, 17, -42, 17, -42 }, }; ALIGN32(int16_t tab_dct_16_avx2[][16][16]) = { { // order is 0 7 3 4 1 6 2 5, for dct 1 { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, // 0 { 45, 4, 35, 29, 43, 13, 40, 21, 45, 4, 35, 29, 43, 13, 40, 21 }, // 1 { 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25 }, // 2 { 43, -13, -21, -40, 29, -35, 4, -45, 43, -13, -21, -40, 29, -35, 4, -45 }, // 3 { 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17 }, // 4 { 40, 21, -43, -13, 4, 45, -35, 29, 40, 21, -43, -13, 4, 45, -35, 29 }, // 5 { 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44 }, // 6 { 35, -29, 4, 45, -21, -40, -43, 13, 35, -29, 4, 45, -21, -40, -43, 13 }, // 7 { 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32 }, // 8 { 29, 35, 45, -4, -40, 21, -13, -43, 29, 35, 45, -4, -40, 21, -13, -43 }, // 9 { 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9 }, // 10 { 21, -40, 13, -43, -45, 4, 29, 35, 21, -40, 13, -43, -45, 4, 29, 35 }, // 11 { 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42 }, // 12 { 13, 43, -40, 21, -35, -29, 45, 4, 13, 43, -40, 21, -35, -29, 45, 4 }, // 13 { 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38 }, // 14 { 4, -45, -29, 35, -13, 43, 21, -40, 4, -45, -29, 35, -13, 43, 21, -40 } // 15 }, { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, // 0 { 45, -45, 35, -35, 43, -43, 40, -40, 45, -45, 35, -35, 43, -43, 40, -40 }, // 1 { 44, 44, 9, 9, 38, 38, 25, 25, 44, 44, 9, 9, 38, 38, 25, 25 }, // 2 { 43, -43, -21, 21, 29, -29, 4, -4, 43, -43, -21, 21, 29, -29, 4, -4 }, // 3 { 42, 42, -42, -42, 17, 17, -17, -17, 42, 42, -42, -42, 17, 17, -17, -17 }, // 4 { 40, -40, -43, 43, 4, -4, -35, 35, 40, -40, -43, 43, 4, -4, -35, 35 }, // 5 { 38, 38, -25, -25, -9, -9, -44, -44, 38, 38, -25, -25, -9, -9, -44, -44 }, // 6 { 35, -35, 4, -4, -21, 21, -43, 43, 35, -35, 4, -4, -21, 21,
-43, 43 }, // 7 { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, // 8 { 29, -29, 45, -45, -40, 40, -13, 13, 29, -29, 45, -45, -40, 40, -13, 13 }, // 9 { 25, 25, 38, 38, -44, -44, 9, 9, 25, 25, 38, 38, -44, -44, 9, 9 }, // 10 { 21, -21, 13, -13, -45, 45, 29, -29, 21, -21, 13, -13, -45, 45, 29, -29 }, // 11 { 17, 17, -17, -17, -42, -42, 42, 42, 17, 17, -17, -17, -42, -42, 42, 42 }, // 12 { 13, -13, -40, 40, -35, 35, 45, -45, 13, -13, -40, 40, -35, 35, 45, -45 }, // 13 { 9, 9, -44, -44, -25, -25, 38, 38, 9, 9, -44, -44, -25, -25, 38, 38 }, // 14 { 4, -4, -29, 29, -13, 13, 21, -21, 4, -4, -29, 29, -13, 13, 21, -21 } // 15 }, { { 4, -4, 29, -29, 13, -13, 21, -21, 4, -4, 29, -29, 13, -13, 21, -21 }, // 0 { -13, 13, -40, 40, -35, 35, -45, 45, -13, 13, -40, 40, -35, 35, -45, 45 }, // 1 { 21, -21, -13, 13, 45, -45, 29, -29, 21, -21, -13, 13, 45, -45, 29, -29 }, // 2 { -29, 29, 45, -45, -40, 40, 13, -13, -29, 29, 45, -45, -40, 40, 13, -13 }, // 3 { 35, -35, -4, 4, 21, -21, -43, 43, 35, -35, -4, 4, 21, -21, -43, 43 }, // 4 { -40, 40, -43, 43, 4, -4, 35, -35, -40, 40, -43, 43, 4, -4, 35, -35 }, // 5 { 43, -43, 21, -21, -29, 29, 4, -4, 43, -43, 21, -21, -29, 29, 4, -4 }, // 6 { -45, 45, 35, -35, 43, -43, -40, 40, -45, 45, 35, -35, 43, -43, -40, 40 } // 7 } }; ALIGN32(int16_t tab_dct_16_shuffle_avx2[][16]) = { { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504, 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504, }, { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, }, { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 } }; ALIGN32(static const int16_t tab_dct_8x32_avx2[][16]) = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, //0 { 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25 }, //1 { 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17 }, //2 { 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44 }, //3 { 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32 }, //4 { 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9 }, //5 { 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42 }, //6 { 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38 } //7 }; ALIGN32(static const int32_t tab_dct_8x8_avx2[][8] )= { { 32, 32, 32, 32, 32, 32, 32, 32 },// 0 { 44, 38, 44, 38, 25, 9, 25, 9 },// { 42, 17, 42, 17, -17, -42, -17, -42 },// 2 { 38, -9, 38, -9, -44, -25, -44, -25 },// { 32, -32, 32, -32, -32, 32, -32, 32 },// 4 { 25, -44, 25, -44, 9, 38, 9, 38 },// { 17, -42, 17, -42, 42, -17, 42, -17 },// 6 { 9, -25, 9, -25, 38, -44, 38, -44 } // }; ALIGN32(static const int16_t tab_dct1_4[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 42, 17, -17, -42, 42, 17, -17, -42 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 17, -42, 42, -17, 17, -42, 42, -17 } }; ALIGN32(static const int16_t tab_dct_8[][8]) = { { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, { 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17 }, { 17, -42, 17, -42, 17, -42, 17, -42 }, { 44, 9, 38, 25, 44, 9, 38, 25 }, { 38, -25, -9, -44, 38, -25, -9, -44 }, { 25, 38, -44, 9, 25, 38, -44, 9 }, { 9, -44, -25, 38, 9, -44, -25, 38 }, { 42, 42, 
-42, -42, 17, 17, -17, -17 }, { 17, 17, -17, -17, -42, -42, 42, 42 }, { 44, -44, 9, -9, 38, -38, 25, -25 }, { 38, -38, -25, 25, -9, 9, -44, 44 }, { 25, -25, 38, -38, -44, 44, 9, -9 }, { 9, -9, -44, 44, -25, 25, 38, -38 } }; ALIGN32(static const int16_t tab_dct_8_1[][8]) = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 44, 38, 25, 9, - 9, -25, -38, -44 }, { 42, 17, -17, -42, -42, -17, 17, 42 }, { 38, - 9, -44, -25, 25, 44, 9, -38 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 25, -44, 9, 38, -38, - 9, 44, -25 }, { 17, -42, 42, -17, -17, 42, -42, 17 }, { 9, -25, 38, -44, 44, -38, 25, - 9 } }; ALIGN32(static const int16_t tab_dct_16_0[][8]) = { { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, // 0 { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, // 1 { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A }, // 2 { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 } // 3 }; ALIGN32(static const int16_t tab_dct_16_1[][8]) = { { 45, 43, 40, 35, 29, 21, 13, 4 }, // 0 { 43, 29, 4, -21, -40, -45, -35, -13 }, // 1 { 40, 4, -35, -43, -13, 29, 45, 21 }, // 2 { 35, -21, -43, 4, 45, 13, -40, -29 }, // 3 { 29, -40, -13, 45, -4, -43, 21, 35 }, // 4 { 21, -45, 29, 13, -43, 35, 4, -40 }, // 5 { 13, -35, 45, -40, 21, 4, -29, 43 }, // 6 { 4, -13, 21, -29, 35, -40, 43, -45 }, // 7 { 42, 42, -42, -42, 17, 17, -17, -17 }, // 8 { 17, 17, -17, -17, -42, -42, 42, 42 }, // 9 { 44, 44, 9, 9, 38, 38, 25, 25 }, // 10 { 38, 38, -25, -25, -9, -9, -44, -44 }, // 11 { 25, 25, 38, 38, -44, -44, 9, 9 }, // 12 { 9, 9, -44, -44, -25, -25, 38, 38 }, // 13 #define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) }, \ { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) }, MAKE_COEF(45, 43, 40, 35, 29, 21, 13, 4) MAKE_COEF(43, 29, 4, -21, -40, -45, -35, -13) MAKE_COEF(40, 4, -35, -43, -13, 29, 45, 21) MAKE_COEF(35, -21, -43, 4, 45, 13, -40, -29) MAKE_COEF(29, -40, -13, 45, -4, -43, 21, 35) MAKE_COEF(21, -45, 29, 13, -43, 35, 4, -40) MAKE_COEF(13, -35, 45, -40, 21, 4, -29, 43) MAKE_COEF( 4, -13, 21, -29, 35, -40, 43, -45) #undef MAKE_COEF }; ALIGN32(static const int16_t tab_dct_32_0[][8]) = { { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 }, // 0 }; ALIGN32(static const int16_t tab_dct_32_1[][8]) = { { 44, -44, 9, -9, 38, -38, 25, -25 }, // 0 { 38, -38, -25, 25, -9, 9, -44, 44 }, // 1 { 25, -25, 38, -38, -44, 44, 9, -9 }, // 2 { 9, -9, -44, 44, -25, 25, 38, -38 }, // 3 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) }, MAKE_COEF8(45, 43, 40, 35, 29, 21, 13, 4) // 4 MAKE_COEF8(43, 29, 4, -21, -40, -45, -35, -13) // 5 MAKE_COEF8(40, 4, -35, -43, -13, 29, 45, 21) // 6 MAKE_COEF8(35, -21, -43, 4, 45, 13, -40, -29) // 7 MAKE_COEF8(29, -40, -13, 45, -4, -43, 21, 35) // 8 MAKE_COEF8(21, -45, 29, 13, -43, 35, 4, -40) // 9 MAKE_COEF8(13, -35, 45, -40, 21, 4, -29, 43) // 10 MAKE_COEF8( 4, -13, 21, -29, 35, -40, 43, -45) // 11 #undef MAKE_COEF8 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \ { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) }, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // 12 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) // 14 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) // 16 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 
27, -2, -30, -45, -39, -15) // 18 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) // 20 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) // 22 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) // 24 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) // 26 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) // 28 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) // 30 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) // 32 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) // 34 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) // 36 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) // 38 MAKE_COEF16( 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) // 40 MAKE_COEF16( 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) // 42 #undef MAKE_COEF16 { 32, 32, 32, 32, 32, 32, 32, 32 }, // 44 { 32, 32, -32, -32, -32, -32, 32, 32 }, // 45 { 42, 42, 17, 17, -17, -17, -42, -42 }, // 46 { -42, -42, -17, -17, 17, 17, 42, 42 }, // 47 { 17, 17, -42, -42, 42, 42, -17, -17 }, // 48 { -17, -17, 42, 42, -42, -42, 17, 17 }, // 49 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \ { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \ { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \ { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) }, MAKE_COEF16(44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44) // 50 MAKE_COEF16(38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38) // 54 // TODO: convert below table here #undef MAKE_COEF16 { 25, 25, -44, -44, 9, 9, 38, 38 }, // 58 { -38, -38, -9, -9, 44, 44, -25, -25 }, // 59 { -25, -25, 44, 44, -9, -9, -38, -38 }, // 60 { 38, 38, 9, 9, -44, -44, 25, 25 }, // 61 { 9, 9, -25, -25, 38, 38, -44, -44 }, // 62 { 44, 44, -38, -38, 25, 25, -9, -9 }, // 63 { -9, -9, 25, 25, -38, -38, 44, 44 }, // 64 { -44, -44, 38, 38, -25, -25, 9, 9 }, // 65 { 45, 45, 43, 43, 40, 40, 35, 35 }, // 66 { 29, 29, 21, 21, 13, 13, 4, 4 }, // 67 { -4, -4, -13, -13, -21, -21, -29, -29 }, // 68 { -35, -35, -40, -40, -43, -43, -45, -45 }, // 69 { 43, 43, 29, 29, 4, 4, -21, -21 }, // 70 { -40, -40, -45, -45, -35, -35, -13, -13 }, // 71 { 13, 13, 35, 35, 45, 45, 40, 40 }, // 72 { 21, 21, -4, -4, -29, -29, -43, -43 }, // 73 { 40, 40, 4, 4, -35, -35, -43, -43 }, // 74 { -13, -13, 29, 29, 45, 45, 21, 21 }, // 75 { -21, -21, -45, -45, -29, -29, 13, 13 }, // 76 { 43, 43, 35, 35, -4, -4, -40, -40 }, // 77 { 35, 35, -21, -21, -43, -43, 4, 4 }, // 78 { 45, 45, 13, 13, -40, -40, -29, -29 }, // 79 { 29, 29, 40, 40, -13, -13, -45, -45 }, // 80 { -4, -4, 43, 43, 21, 21, -35, -35 }, // 81 { 29, 29, -40, -40, -13, -13, 45, 45 }, // 82 { -4, -4, -43, -43, 21, 21, 35, 35 }, // 83 { -35, -35, -21, -21, 43, 43, 4, 4 }, // 84 { -45, -45, 13, 13, 40, 40, -29, -29 }, // 85 { 21, 21, -45, -45, 29, 29, 13, 13 }, // 86 { -43, -43, 35, 35, 4, 4, -40, -40 }, // 87 { 40, 40, -4, -4, -35, -35, 43, 43 }, // 88 { -13, -13, -29, -29, 45, 45, -21, -21 }, // 89 { 13, 13, -35, -35, 45, 45, -40, -40 }, // 90 { 21, 21, 4, 4, -29, -29, 43, 43 }, // 91 { -43, -43, 29, 29, -4, -4, -21, -21 }, // 92 { 40, 40, -45, -45, 35, 35, -13, 
-13 }, // 93 { 4, 4, -13, -13, 21, 21, -29, -29 }, // 94 { 35, 35, -40, -40, 43, 43, -45, -45 }, // 95 { 45, 45, -43, -43, 40, 40, -35, -35 }, // 96 { 29, 29, -21, -21, 13, 13, -4, -4 }, // 97 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \ { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \ { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \ { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) }, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // 98 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) //102 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) //106 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) //110 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) //114 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) //118 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) //122 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) //126 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) //130 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) //134 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) //138 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) //142 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) //146 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) //150 MAKE_COEF16( 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) //154 MAKE_COEF16( 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) //158 #undef MAKE_COEF16 }; /* --------------------------------------------------------------------------- * secondary transform */ //ALIGN16(const int16_t g_2T[SEC_TR_SIZE * SEC_TR_SIZE]) = { // 123, -35, -8, -3, // e0 e1 e2 e3 // -32, -120, 30, 10, // f0 f1 f2 f3 // 14, 25, 123, -22, // g0 g1 g2 g3 // 8, 13, 19, 126 // h0 h1 h2 h3 //}; ALIGN16(static const int16_t g_2T_H[2 * (2 * SEC_TR_SIZE)]) = { 123, -35, -32, -120, 14, 25, 8, 13, // e0 e1 f0 f1 g0 g1 h0 h1 -8, -3, 30, 10, 123, -22, 19, 126 // e2 e3 f2 f3 g2 g3 h2 h3 }; ALIGN16(static const int16_t g_2T_V[8 * (2 * SEC_TR_SIZE)]) = { 123, -35, 123, -35, 123, -35, 123, -35, // e0 e1 e0 e1 e0 e1 e0 e1 -8, -3, -8, -3, -8, -3, -8, -3, // e2 e3 e2 e3 e2 e3 e2 e3 -32, -120, -32, -120, -32, -120, -32, -120, // f0 f1 f0 f1 f0 f1 f0 f1 30, 10, 30, 10, 30, 10, 30, 10, // f2 f3 f2 f3 f2 f3 f2 f3 14, 25, 14, 25, 14, 25, 14, 25, // g0 g1 g0 g1 g0 g1 g0 g1 123, -22, 123, -22, 123, -22, 123, -22, // g2 g3 g2 g3 g2 g3 g2 g3 8, 13, 8, 13, 8, 13, 8, 13, // h0 h1 h0 h1 h0 h1 h0 h1 19, 126, 19, 126, 19, 126, 19, 126, // h2 h3 h2 h3 h2 h3 h2 h3 }; /* --------------------------------------------------------------------------- * secondary transform (only for 4x4) */ //ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]) = { // 34, 58, 72, 81, // e0 e1 e2 e3 // 77, 69, -7, -75, // f0 f1 f2 f3 // 79, -33, -75, 58, // g0 g1 g2 g3 // 55, -84, 73, -28 // h0 h1 h2 h3 //}; ALIGN16(static const int16_t g_2TC_H[2 * (2 * SEC_TR_SIZE)]) = { 34, 58, 77, 69, 79, -33, 55, -84, // e0 e1 f0 f1 g0 g1 h0 h1 72, 81, -7, -75, -75, 58, 73, -28 // e2 e3 f2 f3 
g2 g3 h2 h3 }; ALIGN16(static const int16_t g_2TC_V[8 * (2 * SEC_TR_SIZE)]) = { 34, 58, 34, 58, 34, 58, 34, 58, // e0 e1 e0 e1 e0 e1 e0 e1 72, 81, 72, 81, 72, 81, 72, 81, // e2 e3 e2 e3 e2 e3 e2 e3 77, 69, 77, 69, 77, 69, 77, 69, // f0 f1 f0 f1 f0 f1 f0 f1 -7, -75, -7, -75, -7, -75, -7, -75, // f2 f3 f2 f3 f2 f3 f2 f3 79, -33, 79, -33, 79, -33, 79, -33, // g0 g1 g0 g1 g0 g1 g0 g1 -75, 58, -75, 58, -75, 58, -75, 58, // g2 g3 g2 g3 g2 g3 g2 g3 55, -84, 55, -84, 55, -84, 55, -84, // h0 h1 h0 h1 h0 h1 h0 h1 73, -28, 73, -28, 73, -28, 73, -28, // h2 h3 h2 h3 h2 h3 h2 h3 }; //************************************************************** //futl //************************************************************** ALIGN32(static const int16_t tab_dct_4_avx2_1[][16]) = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17, 17, -42, 17, -42, 17, -42, 17, -42 }, }; ALIGN32(static const int16_t tab_dct_4_avx2[][16]) = { { 32, 32, 32, 32, 42, 17, 42, 17, 32, 32, 32, 32, 42, 17, 42, 17 }, { 32, -32, 32, -32, 17, -42, 17, -42, 32, -32, 32, -32, 17, -42, 17, -42 }, }; /* --------------------------------------------------------------------------- */ void dct_c_4x4_avx2(const coeff_t *src, coeff_t *dst, int i_src) { #define ADD1 0 #define ADD2 64 #define SHIFT1 0 #define SHIFT2 7 __m256i T20; __m256i T30, T31, T32, T40, T41, T42, T50, T51, T60, T70, T71; __m256i c_add2 = _mm256_set1_epi32(ADD2); __m256i Tab0, Tab1; Tab0 = _mm256_load_si256((__m256i*)tab_dct_4_avx2_1[0]); Tab1 = _mm256_load_si256((__m256i*)tab_dct_4_avx2_1[1]); T20 = _mm256_loadu_si256((__m256i*)(src + 0 * i_src)); T30 = _mm256_shufflehi_epi16(T20, 0x9C); //0 1 2 3 4 7 5 6 8 11 9 10 12 15 13 14...... T31 = _mm256_shufflelo_epi16(T30, 0x9C); //0 3 1 2 4 7 5 6 8 11 9 10 12 15 13 14...... T32 = _mm256_permute4x64_epi64(T31, 0xB4); //0 3 1 2 4 7 5 6 12 15 13 14 8 11 9 10 ...... 
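/* Butterfly stage of the first (horizontal) pass of the 4x4 DCT: with each
 * 4-sample row reordered to (x0, x3, x1, x2), the hadd/hsub below produce the
 * (x0+x3, x1+x2) and (x0-x3, x1-x2) terms, and the pmaddwd against
 * tab_dct_4_avx2 evaluates the AVS2 4-point basis pairs {32,32}, {42,17},
 * {32,-32}, {17,-42} for all four input rows held in one 256-bit register. */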
T40 = _mm256_hadd_epi16(T32, T32); T41 = _mm256_hsub_epi16(T32, T32); T42 = _mm256_unpacklo_epi64(T40, T41); T50 = _mm256_madd_epi16(T42, _mm256_load_si256((__m256i*)tab_dct_4_avx2[0])); T51 = _mm256_madd_epi16(T42, _mm256_load_si256((__m256i*)tab_dct_4_avx2[1])); T60 = _mm256_packs_epi32(T50, T51); T70 = _mm256_permute2f128_si256(T60, T60, 0x20); T71 = _mm256_permute2f128_si256(T60, T60, 0x31); T30 = _mm256_madd_epi16(T70, Tab0); T31 = _mm256_madd_epi16(T71, Tab0); T40 = _mm256_add_epi32(T30, T31); T50 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add2), SHIFT2); T30 = _mm256_madd_epi16(T70, Tab1); T31 = _mm256_madd_epi16(T71, Tab1); T41 = _mm256_sub_epi32(T30, T31); T51 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_add2), SHIFT2); T60 = _mm256_packs_epi32(T50, T51); __m256i mask64 = _mm256_set1_epi8(0xff); _mm256_maskstore_epi64((long long *)(dst), mask64, T60); #undef SHIFT1 #undef ADD1 #undef SHIFT2 #undef ADD2 } ALIGN32(static const int16_t tab_dct_8_avx2[][16]) = { { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17 }, { 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42 }, { 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25 }, { 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44 }, { 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9 }, { 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38 }, { 42, 42, -42, -42, 17, 17, -17, -17, 42, 42, -42, -42, 17, 17, -17, -17 }, { 17, 17, -17, -17, -42, -42, 42, 42, 17, 17, -17, -17, -42, -42, 42, 42 }, { 44, -44, 9, -9, 38, -38, 25, -25, 44, -44, 9, -9, 38, -38, 25, -25 }, { 38, -38, -25, 25, -9, 9, -44, 44, 38, -38, -25, 25, -9, 9, -44, 44 }, { 25, -25, 38, -38, -44, 44, 9, -9, 25, -25, 38, -38, -44, 44, 9, -9 }, { 9, -9, -44, 44, -25, 25, 38, -38, 9, -9, -44, 44, -25, 25, 38, -38 } }; /* --------------------------------------------------------------------------- */ void dct_c_8x8_avx2(const coeff_t *src, coeff_t *dst, int i_src) { #define ADD1 1 #define ADD2 128 #define SHIFT1 1 #define SHIFT2 8 // Const __m256i c_add1 = _mm256_set1_epi32(ADD1); // add1 = 1 __m256i c_add2 = _mm256_set1_epi32(ADD2); // add2 = 128 // DCT1 __m256i T00, T01, T02, T03; __m256i T10, T11, T12, T13; __m256i T20, T21, T22, T23; __m256i T30, T31; __m256i T40, T41, T42, T43; __m256i T50, T51, T52, T53; __m256i T60, T61; __m256i T70, T71, T72, T73, T74, T75; __m256i T80, T81, T82, T83; __m256i Tab; T00 = _mm256_loadu_si256((__m256i*)(src + 0 * i_src)); T01 = _mm256_loadu_si256((__m256i*)(src + 2 * i_src)); T02 = _mm256_loadu_si256((__m256i*)(src + 4 * i_src)); T03 = _mm256_loadu_si256((__m256i*)(src + 6 * i_src)); Tab = _mm256_loadu_si256((__m256i*)tab_dct_8_avx2[0]); T10 = _mm256_shuffle_epi8(T00, Tab); T11 = _mm256_shuffle_epi8(T01, Tab); T12 = _mm256_shuffle_epi8(T02, Tab); T13 = _mm256_shuffle_epi8(T03, Tab); T20 = _mm256_hadd_epi16(T10, T11); T21 = _mm256_hadd_epi16(T12, T13); T22 = _mm256_hsub_epi16(T10, T11); T23 = _mm256_hsub_epi16(T12, T13); T30 = _mm256_hadd_epi16(T20, T21); T31 = _mm256_hsub_epi16(T20, T21); T40 = _mm256_madd_epi16(T30, _mm256_load_si256((__m256i*)tab_dct_8_avx2[1])); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add1), SHIFT1); T41 = _mm256_madd_epi16(T30, 
_mm256_load_si256((__m256i*)tab_dct_8_avx2[2])); T41 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_add1), SHIFT1); T42 = _mm256_madd_epi16(T31, _mm256_load_si256((__m256i*)tab_dct_8_avx2[3])); T42 = _mm256_srai_epi32(_mm256_add_epi32(T42, c_add1), SHIFT1); T43 = _mm256_madd_epi16(T31, _mm256_load_si256((__m256i*)tab_dct_8_avx2[4])); T43 = _mm256_srai_epi32(_mm256_add_epi32(T43, c_add1), SHIFT1); T50 = _mm256_packs_epi32(T40, T42); T52 = _mm256_packs_epi32(T41, T43); Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[5]); T40 = _mm256_madd_epi16(T22, Tab); T41 = _mm256_madd_epi16(T23, Tab); T40 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add1), SHIFT1); Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[6]); T42 = _mm256_madd_epi16(T22, Tab); T43 = _mm256_madd_epi16(T23, Tab); T42 = _mm256_hadd_epi32(T42, T43); T42 = _mm256_srai_epi32(_mm256_add_epi32(T42, c_add1), SHIFT1); T51 = _mm256_packs_epi32(T40, T42); Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[7]); T40 = _mm256_madd_epi16(T22, Tab); T41 = _mm256_madd_epi16(T23, Tab); T40 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add1), SHIFT1); Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[8]); T42 = _mm256_madd_epi16(T22, Tab); T43 = _mm256_madd_epi16(T23, Tab); T42 = _mm256_hadd_epi32(T42, T43); T42 = _mm256_srai_epi32(_mm256_add_epi32(T42, c_add1), SHIFT1); T53 = _mm256_packs_epi32(T40, T42); T60 = _mm256_permute4x64_epi64(T50, 0xD8); T61 = _mm256_permute4x64_epi64(T50, 0x72); T50 = _mm256_unpacklo_epi16(T60, T61); T60 = _mm256_permute4x64_epi64(T51, 0xD8); T61 = _mm256_permute4x64_epi64(T51, 0x72); T51 = _mm256_unpacklo_epi16(T60, T61); T60 = _mm256_permute4x64_epi64(T52, 0xD8); T61 = _mm256_permute4x64_epi64(T52, 0x72); T52 = _mm256_unpacklo_epi16(T60, T61); T60 = _mm256_permute4x64_epi64(T53, 0xD8); T61 = _mm256_permute4x64_epi64(T53, 0x72); T53 = _mm256_unpacklo_epi16(T60, T61); Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[0]); T10 = _mm256_shuffle_epi8(T50, Tab); T11 = _mm256_shuffle_epi8(T51, Tab); T12 = _mm256_shuffle_epi8(T52, Tab); T13 = _mm256_shuffle_epi8(T53, Tab); // DCT2 Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[1]); T20 = _mm256_madd_epi16(T10, Tab); T21 = _mm256_madd_epi16(T11, Tab); T22 = _mm256_madd_epi16(T12, Tab); T23 = _mm256_madd_epi16(T13, Tab); T30 = _mm256_hadd_epi32(T20, T21); T31 = _mm256_hadd_epi32(T22, T23); T40 = _mm256_hadd_epi32(T30, T31); T41 = _mm256_hsub_epi32(T30, T31); T50 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add2), SHIFT2); T51 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_add2), SHIFT2); T70 = _mm256_packs_epi32(T50, T51); T70 = _mm256_permute4x64_epi64(T70, 0xD8); T70 = _mm256_shuffle_epi32(T70, 0xD8); #define MAKE_ODD(tab, TT0) \ Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[tab]); \ T20 = _mm256_madd_epi16(T10, Tab); \ T21 = _mm256_madd_epi16(T11, Tab); \ T22 = _mm256_madd_epi16(T12, Tab); \ T23 = _mm256_madd_epi16(T13, Tab); \ T30 = _mm256_hadd_epi32(T20, T21); \ T31 = _mm256_hadd_epi32(T22, T23); \ T40 = _mm256_hadd_epi32(T30, T31); \ T50 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add2), SHIFT2); \ Tab = _mm256_load_si256((__m256i*)tab_dct_8_avx2[tab + 1]); \ T20 = _mm256_madd_epi16(T10, Tab); \ T21 = _mm256_madd_epi16(T11, Tab); \ T22 = _mm256_madd_epi16(T12, Tab); \ T23 = _mm256_madd_epi16(T13, Tab); \ T30 = _mm256_hadd_epi32(T20, T21); \ T31 = _mm256_hadd_epi32(T22, T23); \ T40 = _mm256_hadd_epi32(T30, T31); \ T51 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_add2), SHIFT2); \ TT0 = _mm256_packs_epi32(T50, 
T51); \ TT0 = _mm256_permute4x64_epi64(TT0, 0xD8); \ TT0 = _mm256_shuffle_epi32(TT0, 0xD8); MAKE_ODD(9, T71); MAKE_ODD(11, T72); MAKE_ODD(13, T73); T74 = _mm256_permute2f128_si256(T70, T71, 0x20);//0 2 T75 = _mm256_permute2f128_si256(T70, T71, 0x31);//4 6 T80 = _mm256_permute2f128_si256(T74, T72, 0x20); T81 = _mm256_permute2f128_si256(T74, T72, 0x31); T82 = _mm256_permute2f128_si256(T75, T73, 0x20); T83 = _mm256_permute2f128_si256(T75, T73, 0x31); __m256i mask64 = _mm256_set1_epi8(0xff); _mm256_maskstore_epi64((long long *)(dst + 0 * i_src), mask64, T80); _mm256_maskstore_epi64((long long *)(dst + 2 * i_src), mask64, T81); _mm256_maskstore_epi64((long long *)(dst + 4 * i_src), mask64, T82); _mm256_maskstore_epi64((long long *)(dst + 6 * i_src), mask64, T83); #undef MAKE_ODD #undef SHIFT1 #undef ADD1 #undef SHIFT2 #undef ADD2 } /* --------------------------------------------------------------------------- */ void dct_c_16x16_avx2(const coeff_t * src, coeff_t * dst, int i_src) { const int SHIFT1 = B16X16_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; const int ADD1 = 1 << (SHIFT1 - 1); const int SHIFT2 = B16X16_IN_BIT + FACTO_BIT; const int ADD2 = 1 << (SHIFT2 - 1); __m256i data0, data1, data2, data3, data4, data5, data6, data7, data8, data9, dataA, dataB, dataC, dataD, dataE, dataF; __m256i s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; __m256i ss0, ss1, ss2, ss3, sd0, sd1, sd2, sd3; __m256i sss0, sss1, ssd0, ssd1; __m256i shuffle0; __m256i shuffle1; __m256i coeff0, coeff1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7; __m256i coeff8, coeff9, coeffA, coeffB, coeffC, coeffD, coeffE, coeffF; __m256i coeff_0, coeff_1, coeff_2, coeff_3, coeff_4, coeff_5, coeff_6, coeff_7; __m256i add1, add2; __m256i temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF; int i; add1 = _mm256_set1_epi32(ADD1); shuffle0 = _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[0]); shuffle1 = _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[1]); #define load_coeff(var, line_no) var = _mm256_load_si256((__m256i *) tab_dct_16_avx2[0][line_no]) load_coeff(coeff0, 0); load_coeff(coeff1, 1); load_coeff(coeff2, 2); load_coeff(coeff3, 3); load_coeff(coeff4, 4); load_coeff(coeff5, 5); load_coeff(coeff6, 6); load_coeff(coeff7, 7); load_coeff(coeff8, 8); load_coeff(coeff9, 9); load_coeff(coeffA, 10); load_coeff(coeffB, 11); load_coeff(coeffC, 12); load_coeff(coeffD, 13); load_coeff(coeffE, 14); load_coeff(coeffF, 15); #undef load_coeff // load data from src data0 = _mm256_loadu2_m128i((__m128i *)(src + 8 * i_src + 0), (__m128i *)(src + 0 * i_src + 0)); // [00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87] data1 = _mm256_loadu2_m128i((__m128i *)(src + 8 * i_src + 8), (__m128i *)(src + 0 * i_src + 8)); // [08 09 0A 0B 0C 0D 0E 0F 88 89 8A 8B 8C 8D 8E 8F] data2 = _mm256_loadu2_m128i((__m128i *)(src + 9 * i_src + 0), (__m128i *)(src + 1 * i_src + 0)); data3 = _mm256_loadu2_m128i((__m128i *)(src + 9 * i_src + 8), (__m128i *)(src + 1 * i_src + 8)); data4 = _mm256_loadu2_m128i((__m128i *)(src + 10 * i_src + 0), (__m128i *)(src + 2 * i_src + 0)); data5 = _mm256_loadu2_m128i((__m128i *)(src + 10 * i_src + 8), (__m128i *)(src + 2 * i_src + 8)); data6 = _mm256_loadu2_m128i((__m128i *)(src + 11 * i_src + 0), (__m128i *)(src + 3 * i_src + 0)); data7 = _mm256_loadu2_m128i((__m128i *)(src + 11 * i_src + 8), (__m128i *)(src + 3 * i_src + 8)); data8 = _mm256_loadu2_m128i((__m128i *)(src + 12 * i_src + 0), (__m128i *)(src + 4 * i_src + 0)); 
data9 = _mm256_loadu2_m128i((__m128i *)(src + 12 * i_src + 8), (__m128i *)(src + 4 * i_src + 8)); dataA = _mm256_loadu2_m128i((__m128i *)(src + 13 * i_src + 0), (__m128i *)(src + 5 * i_src + 0)); dataB = _mm256_loadu2_m128i((__m128i *)(src + 13 * i_src + 8), (__m128i *)(src + 5 * i_src + 8)); dataC = _mm256_loadu2_m128i((__m128i *)(src + 14 * i_src + 0), (__m128i *)(src + 6 * i_src + 0)); dataD = _mm256_loadu2_m128i((__m128i *)(src + 14 * i_src + 8), (__m128i *)(src + 6 * i_src + 8)); dataE = _mm256_loadu2_m128i((__m128i *)(src + 15 * i_src + 0), (__m128i *)(src + 7 * i_src + 0)); dataF = _mm256_loadu2_m128i((__m128i *)(src + 15 * i_src + 8), (__m128i *)(src + 7 * i_src + 8)); // reoder the data data0 = _mm256_shuffle_epi8(data0, shuffle0); // [00 07 03 04 01 06 02 05 80 87 83 84 81 86 82 85] data2 = _mm256_shuffle_epi8(data2, shuffle0); data4 = _mm256_shuffle_epi8(data4, shuffle0); data6 = _mm256_shuffle_epi8(data6, shuffle0); data8 = _mm256_shuffle_epi8(data8, shuffle0); dataA = _mm256_shuffle_epi8(dataA, shuffle0); dataC = _mm256_shuffle_epi8(dataC, shuffle0); dataE = _mm256_shuffle_epi8(dataE, shuffle0); data1 = _mm256_shuffle_epi8(data1, shuffle1); // [0F 08 0B 0C 0E 09 0D 0A 8F 88 8B 8C 8E 89 8D 8A] data3 = _mm256_shuffle_epi8(data3, shuffle1); data5 = _mm256_shuffle_epi8(data5, shuffle1); data7 = _mm256_shuffle_epi8(data7, shuffle1); data9 = _mm256_shuffle_epi8(data9, shuffle1); dataB = _mm256_shuffle_epi8(dataB, shuffle1); dataD = _mm256_shuffle_epi8(dataD, shuffle1); dataF = _mm256_shuffle_epi8(dataF, shuffle1); s0 = _mm256_add_epi16(data0, data1); // [s00 s07 s03 s04 s01 s06 s02 s05 s80 s87 s83 s84 s81 s86 s82 s85] s1 = _mm256_add_epi16(data2, data3); // [s10 s17 s13 s14 s11 s16 s12 s15 s90 s97 s93 s94 s91 s96 s92 s95] s2 = _mm256_add_epi16(data4, data5); s3 = _mm256_add_epi16(data6, data7); s4 = _mm256_add_epi16(data8, data9); s5 = _mm256_add_epi16(dataA, dataB); s6 = _mm256_add_epi16(dataC, dataD); s7 = _mm256_add_epi16(dataE, dataF); d0 = _mm256_sub_epi16(data0, data1); // [d00 d07 d03 d04 d01 d06 d02 d05 d80 d87 d83 d84 d81 d86 d82 d85] d1 = _mm256_sub_epi16(data2, data3); d2 = _mm256_sub_epi16(data4, data5); d3 = _mm256_sub_epi16(data6, data7); d4 = _mm256_sub_epi16(data8, data9); d5 = _mm256_sub_epi16(dataA, dataB); d6 = _mm256_sub_epi16(dataC, dataD); d7 = _mm256_sub_epi16(dataE, dataF); ss0 = _mm256_hadd_epi16(s0, s1); // [ss00 ss03 ss01 ss02 ss10 ss13 ss11 ss12 ss80 ss83 ss81 ss82 ss90 ss93 ss91 ss92] ss1 = _mm256_hadd_epi16(s2, s3); ss2 = _mm256_hadd_epi16(s4, s5); ss3 = _mm256_hadd_epi16(s6, s7); sd0 = _mm256_hsub_epi16(s0, s1); // [sd00 sd03 sd01 sd02 sd80 sd10 sd13 sd11 sd12 sd83 sd81 sd82 sd90 sd93 sd91 sd92] sd1 = _mm256_hsub_epi16(s2, s3); sd2 = _mm256_hsub_epi16(s4, s5); sd3 = _mm256_hsub_epi16(s6, s7); sss0 = _mm256_hadd_epi16(ss0, ss1); // [sss00 sss01 sss10 sss11 sss21 sss22 sss30 sss31 sss80 sss81 sss90 sss91 sssA0 sssA1 sssB0 sssB1] sss1 = _mm256_hadd_epi16(ss2, ss3); ssd0 = _mm256_hsub_epi16(ss0, ss1); // [ssd00 ssd01 ssd10 ssd11 ssd20 ssd21 ssd31 ssd32 ssd80 ssd81 ssd90 ssd9S ssdA0 ssdA1 ssdB0 ssdB1] ssd1 = _mm256_hsub_epi16(ss2, ss3); temp0 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(sss0, coeff0), add1), SHIFT1); // [00 10 20 30 80 90 A0 B0] temp1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(sss1, coeff0), add1), SHIFT1); // [40 50 60 70 C0 D0 E0 F0] data0 = _mm256_packs_epi32(temp0, temp1); // [00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0] temp0 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(sss0, coeff8), add1), SHIFT1); 
temp1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(sss1, coeff8), add1), SHIFT1); data8 = _mm256_packs_epi32(temp0, temp1); // [08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8] temp0 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(ssd0, coeff4), add1), SHIFT1); temp1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(ssd1, coeff4), add1), SHIFT1); data4 = _mm256_packs_epi32(temp0, temp1); // [04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4] temp0 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(ssd0, coeffC), add1), SHIFT1); temp1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_madd_epi16(ssd1, coeffC), add1), SHIFT1); dataC = _mm256_packs_epi32(temp0, temp1); // [0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC] #define CALC_4x(data, coeff) \ temp0 = _mm256_hadd_epi32(_mm256_madd_epi16(sd0, coeff), _mm256_madd_epi16(sd1, coeff)); \ temp1 = _mm256_hadd_epi32(_mm256_madd_epi16(sd2, coeff), _mm256_madd_epi16(sd3, coeff)); \ temp0 = _mm256_srai_epi32(_mm256_add_epi32(temp0, add1), SHIFT1); \ temp1 = _mm256_srai_epi32(_mm256_add_epi32(temp1, add1), SHIFT1); \ data = _mm256_packs_epi32(temp0, temp1); // [0X 1X 2X 3X 4X 5X 6X 7X 8X 9X AX BX CX DX EX FX] -> X = 2 + 4x CALC_4x(data2, coeff2); CALC_4x(data6, coeff6); CALC_4x(dataA, coeffA); CALC_4x(dataE, coeffE); #undef CALC_4x #define CALC_2x(data, coeff) \ temp0 = _mm256_hadd_epi32(_mm256_madd_epi16(d0, coeff), _mm256_madd_epi16(d1, coeff)); \ temp1 = _mm256_hadd_epi32(_mm256_madd_epi16(d2, coeff), _mm256_madd_epi16(d3, coeff)); \ temp2 = _mm256_hadd_epi32(_mm256_madd_epi16(d4, coeff), _mm256_madd_epi16(d5, coeff)); \ temp3 = _mm256_hadd_epi32(_mm256_madd_epi16(d6, coeff), _mm256_madd_epi16(d7, coeff)); \ temp0 = _mm256_hadd_epi32(temp0, temp1); \ temp1 = _mm256_hadd_epi32(temp2, temp3); \ temp0 = _mm256_srai_epi32(_mm256_add_epi32(temp0, add1), SHIFT1); \ temp1 = _mm256_srai_epi32(_mm256_add_epi32(temp1, add1), SHIFT1); \ data = _mm256_packs_epi32(temp0, temp1); // [0X 1X 2X 3X 4X 5X 6X 7X 8X 9X AX BX CX DX EX FX] -> X = 1 + 2x CALC_2x(data1, coeff1); CALC_2x(data3, coeff3); CALC_2x(data5, coeff5); CALC_2x(data7, coeff7); CALC_2x(data9, coeff9); CALC_2x(dataB, coeffB); CALC_2x(dataD, coeffD); CALC_2x(dataF, coeffF); #undef CALC_2x /*-------------------------------------------------------*/ // dct 2 add2 = _mm256_set1_epi32(ADD2); shuffle0 = _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[2]); shuffle1 = _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[3]); #define load_coeff(var, line_no) var = _mm256_load_si256((__m256i *) tab_dct_16_avx2[1][line_no]) load_coeff(coeff0, 0); load_coeff(coeff1, 1); load_coeff(coeff2, 2); load_coeff(coeff3, 3); load_coeff(coeff4, 4); load_coeff(coeff5, 5); load_coeff(coeff6, 6); load_coeff(coeff7, 7); load_coeff(coeff8, 8); load_coeff(coeff9, 9); load_coeff(coeffA, 10); load_coeff(coeffB, 11); load_coeff(coeffC, 12); load_coeff(coeffD, 13); load_coeff(coeffE, 14); load_coeff(coeffF, 15); #undef load_coeff #define load_coeff(var, line_no) var = _mm256_load_si256((__m256i *) tab_dct_16_avx2[2][line_no]) load_coeff(coeff_0, 0); load_coeff(coeff_1, 1); load_coeff(coeff_2, 2); load_coeff(coeff_3, 3); load_coeff(coeff_4, 4); load_coeff(coeff_5, 5); load_coeff(coeff_6, 6); load_coeff(coeff_7, 7); #undef load_coeff // now data0 ~ dataF store all of the data like [00 01 02 03 04 05...] 
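/* Second (vertical) pass: data0..dataF now hold the first-pass output in
 * transposed form (dataN = coefficient N of every input row). Each iteration
 * of the loop below applies the 16-point DCT down these columns and writes
 * output columns i..i+7 of dst; the copies at the end of the loop body move
 * data8..dataF into data0..data7 for the second iteration. */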
for (i = 0; i < 16; i += 8) { r0 = _mm256_permute2x128_si256(data4, data0, 0x02); // [00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47] r1 = _mm256_permute2x128_si256(data4, data0, 0x13); // [08 09 0A 0B 0C 0D 0E 0F 48 49 4A 4B 4C 4D 4E 4F] r2 = _mm256_permute2x128_si256(data5, data1, 0x02); r3 = _mm256_permute2x128_si256(data5, data1, 0x13); r4 = _mm256_permute2x128_si256(data6, data2, 0x02); r5 = _mm256_permute2x128_si256(data6, data2, 0x13); r6 = _mm256_permute2x128_si256(data7, data3, 0x02); r7 = _mm256_permute2x128_si256(data7, data3, 0x13); r0 = _mm256_shuffle_epi8(r0, shuffle0); // [00 03 01 02 07 04 06 05 40 43 41 42 47 44 46 45] r1 = _mm256_shuffle_epi8(r1, shuffle1); // [0F 0C 0E 0D 08 0B 09 0A 4F 4C 4E 4D 48 4B 49 4A] r2 = _mm256_shuffle_epi8(r2, shuffle0); r3 = _mm256_shuffle_epi8(r3, shuffle1); r4 = _mm256_shuffle_epi8(r4, shuffle0); r5 = _mm256_shuffle_epi8(r5, shuffle1); r6 = _mm256_shuffle_epi8(r6, shuffle0); r7 = _mm256_shuffle_epi8(r7, shuffle1); temp0 = _mm256_unpacklo_epi16(r0, r1); // [00 0F 03 0C 01 0E 02 0D 40 4F 43 4C 41 4E 42 4D] temp1 = _mm256_unpackhi_epi16(r0, r1); // [07 08 04 0B 06 09 05 0A 47 48 44 4B 46 49 45 4A] temp2 = _mm256_unpacklo_epi16(r2, r3); temp3 = _mm256_unpackhi_epi16(r2, r3); temp4 = _mm256_unpacklo_epi16(r4, r5); temp5 = _mm256_unpackhi_epi16(r4, r5); temp6 = _mm256_unpacklo_epi16(r6, r7); temp7 = _mm256_unpackhi_epi16(r6, r7); #define CALC_DATA(data, coeff) \ s0 = _mm256_madd_epi16(temp0, coeff); /* [32*s00 32*s03 32*s01 32*s02 32*s40 32*s43 32*s41 32*s42] */ \ s1 = _mm256_madd_epi16(temp1, coeff); /* [32*s07 32*s04 32*s06 32*s05 32*s47 32*s44 32*s46 32*s45] */ \ s2 = _mm256_madd_epi16(temp2, coeff); \ s3 = _mm256_madd_epi16(temp3, coeff); \ s4 = _mm256_madd_epi16(temp4, coeff); \ s5 = _mm256_madd_epi16(temp5, coeff); \ s6 = _mm256_madd_epi16(temp6, coeff); \ s7 = _mm256_madd_epi16(temp7, coeff); \ \ ss0 = _mm256_add_epi32(s0, s1); /* [32*ss02 32*ss01 32*ss03 32*ss00 32*ss42 32*ss41 32*ss43 32*ss40] */ \ ss1 = _mm256_add_epi32(s2, s3); /* [32*ss12 32*ss11 32*ss13 32*ss10 32*ss52 32*ss51 32*ss53 32*ss50] */ \ ss2 = _mm256_add_epi32(s4, s5); \ ss3 = _mm256_add_epi32(s6, s7); \ \ sss0 = _mm256_hadd_epi32(ss0, ss1); /* [32*sss01 32*sss00 32*sss11 32*sss10 32*sss41 32*sss40 32*sss51 32*sss50] */ \ sss1 = _mm256_hadd_epi32(ss2, ss3); /* [32*sss21 32*sss20 32*sss31 32*sss30 32*sss61 32*sss60 32*sss71 32*sss70] */ \ data = _mm256_srai_epi32(_mm256_add_epi32(_mm256_hadd_epi32(sss0, sss1), add2), SHIFT2) // [00 01 02 03 04 05 06 07] CALC_DATA(r0, coeff0); r8 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_hsub_epi32(sss0, sss1), add2), SHIFT2); // [80 81 82 83 84 85 86 87] temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r0, r8), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 8 * 16 + i), (__m128i *)(dst + 0 * 16 + i), temp); CALC_DATA(r4, coeff4); CALC_DATA(rC, coeffC); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r4, rC), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 12 * 16 + i), (__m128i *)(dst + 4 * 16 + i), temp); #undef CALC_DATA #define CALC_DATA(data, coeff) \ s0 = _mm256_madd_epi16(temp0, coeff); /* [32*s00 32*s03 32*s01 32*s02 32*s40 32*s43 32*s41 32*s42] */ \ s1 = _mm256_madd_epi16(temp1, coeff); /* [32*s07 32*s04 32*s06 32*s05 32*s47 32*s44 32*s46 32*s45] */ \ s2 = _mm256_madd_epi16(temp2, coeff); \ s3 = _mm256_madd_epi16(temp3, coeff); \ s4 = _mm256_madd_epi16(temp4, coeff); \ s5 = _mm256_madd_epi16(temp5, coeff); \ s6 = _mm256_madd_epi16(temp6, coeff); \ s7 = _mm256_madd_epi16(temp7, coeff); \ \ ss0 = _mm256_sub_epi32(s0, s1); /* 
[32*ss02 32*ss01 32*ss03 32*ss00 32*ss42 32*ss41 32*ss43 32*ss40] */ \ ss1 = _mm256_sub_epi32(s2, s3); /* [32*ss12 32*ss11 32*ss13 32*ss10 32*ss52 32*ss51 32*ss53 32*ss50] */ \ ss2 = _mm256_sub_epi32(s4, s5); \ ss3 = _mm256_sub_epi32(s6, s7); \ \ sss0 = _mm256_hadd_epi32(ss0, ss1); /* [32*sss01 32*sss00 32*sss11 32*sss10 32*sss41 32*sss40 32*sss51 32*sss50] */ \ sss1 = _mm256_hadd_epi32(ss2, ss3); /* [32*sss21 32*sss20 32*sss31 32*sss30 32*sss61 32*sss60 32*sss71 32*sss70] */ \ data = _mm256_srai_epi32(_mm256_add_epi32(_mm256_hadd_epi32(sss0, sss1), add2), SHIFT2) CALC_DATA(r2, coeff2); CALC_DATA(r6, coeff6); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r2, r6), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 6 * 16 + i), (__m128i *)(dst + 2 * 16 + i), temp); CALC_DATA(rA, coeffA); CALC_DATA(rE, coeffE); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(rA, rE), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 14 * 16 + i), (__m128i *)(dst + 10 * 16 + i), temp); #undef CALC_DATA #define CALC_DATA(data, c0, c1) \ s0 = _mm256_madd_epi16(temp0, c0); \ s1 = _mm256_madd_epi16(temp1, c1); \ s2 = _mm256_madd_epi16(temp2, c0); \ s3 = _mm256_madd_epi16(temp3, c1); \ s4 = _mm256_madd_epi16(temp4, c0); \ s5 = _mm256_madd_epi16(temp5, c1); \ s6 = _mm256_madd_epi16(temp6, c0); \ s7 = _mm256_madd_epi16(temp7, c1); \ \ ss0 = _mm256_add_epi32(s0, s1); \ ss1 = _mm256_add_epi32(s2, s3); \ ss2 = _mm256_add_epi32(s4, s5); \ ss3 = _mm256_add_epi32(s6, s7); \ \ sss0 = _mm256_hadd_epi32(ss0, ss1); \ sss1 = _mm256_hadd_epi32(ss2, ss3); \ \ data = _mm256_srai_epi32(_mm256_add_epi32(_mm256_hadd_epi32(sss0, sss1), add2), SHIFT2) CALC_DATA(r1, coeff1, coeff_0); CALC_DATA(r3, coeff3, coeff_1); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r1, r3), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 3 * 16 + i), (__m128i *)(dst + 1 * 16 + i), temp); CALC_DATA(r5, coeff5, coeff_2); CALC_DATA(r7, coeff7, coeff_3); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r5, r7), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 7 * 16 + i), (__m128i *)(dst + 5 * 16 + i), temp); CALC_DATA(r9, coeff9, coeff_4); CALC_DATA(rB, coeffB, coeff_5); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(r9, rB), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 11 * 16 + i), (__m128i *)(dst + 9 * 16 + i), temp); CALC_DATA(rD, coeffD, coeff_6); CALC_DATA(rF, coeffF, coeff_7); temp = _mm256_permute4x64_epi64(_mm256_packs_epi32(rD, rF), 0xD8); _mm256_storeu2_m128i((__m128i *)(dst + 15 * 16 + i), (__m128i *)(dst + 13 * 16 + i), temp); #undef CALC_DATA data0 = data8; data1 = data9; data2 = dataA; data3 = dataB; data4 = dataC; data5 = dataD; data6 = dataE; data7 = dataF; } } /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct_32x32_avx2[][16]) = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, //order 0 // 0 / 16 { 42, 42, -42, -42, 42, 42, -42, -42, 17, 17, -17, -17, 17, 17, -17, -17 }, //oeder 1 // 8 { 17, 17, -17, -17, 17, 17, -17, -17, -42, -42, 42, 42, -42, -42, 42, 42 }, //order 2 // 24 { 44, -44, 9, -9, 44, -44, 9, -9, 38, -38, 25, -25, 38, -38, 25, -25 }, //order 3 //4 { 38, -38, -25, 25, 38, -38, -25, 25, -9, 9, -44, 44, -9, 9, -44, 44 }, //order 4 //12 { 25, -25, 38, -38, 25, -25, 38, -38, -44, 44, 9, -9, -44, 44, 9, -9 }, //order 5 //20 { 9, -9, -44, 44, 9, -9, -44, 44, -25, 25, 38, -38, -25, 25, 38, -38 }, //order 6 //28 #define MAKE_COE16(a0, a1, a2, a3, a4, a5, a6, a7)\ { (a0), (a7), (a3), (a4), (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5), (a1), 
(a6), (a2), (a5) }, MAKE_COE16(45, 43, 40, 35, 29, 21, 13, 4) // 7 MAKE_COE16(43, 29, 4, -21, -40, -45, -35, -13) // 8 MAKE_COE16(40, 4, -35, -43, -13, 29, 45, 21) // 9 MAKE_COE16(35, -21, -43, 4, 45, 13, -40, -29) // 10 MAKE_COE16(29, -40, -13, 45, -4, -43, 21, 35) // 11 MAKE_COE16(21, -45, 29, 13, -43, 35, 4, -40) // 12 MAKE_COE16(13, -35, 45, -40, 21, 4, -29, 43) // 13 MAKE_COE16(4, -13, 21, -29, 35, -40, 43, -45) // 14 #undef MAKE_COE16 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ {(a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05), (a08), (a15), (a11), (a12), (a09), (a14), (a10), (a13)}, MAKE_COEF16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) // 15 MAKE_COEF16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7) // 16 MAKE_COEF16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) // 17 MAKE_COEF16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) // 18 MAKE_COEF16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) // 19 MAKE_COEF16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) // 20 MAKE_COEF16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) // 21 MAKE_COEF16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) // 22 MAKE_COEF16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) // 23 MAKE_COEF16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) // 24 MAKE_COEF16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) // 25 MAKE_COEF16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) // 26 MAKE_COEF16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) // 27 MAKE_COEF16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) // 28 MAKE_COEF16(7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) // 29 MAKE_COEF16(2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) // 30 #undef MAKE_COE16 }; ALIGN32(static const int32_t tab_dct2_32x32_avx2[][8]) = { #define MAKE_COE8(a0, a1, a2, a3, a4, a5, a6, a7) \ { a0, a1, a2, a3, a4, a5, a6, a7 }, MAKE_COE8(32, 32, 32, 32, 32, 32, 32, 32) //order 0 // 0 MAKE_COE8(42, 17, -17, -42, -42, -17, 17, 42) //order 1 // 8 MAKE_COE8(32, -32, -32, 32, 32, -32, -32, 32) //order 2 // 16 MAKE_COE8(17, -42, 42, -17, -17, 42, -42, 17) //order 3 // 24 MAKE_COE8(44, 38, 25, 9, -9, -25, -38, -44) //order 4 // 4 MAKE_COE8(38, -9, -44, -25, 25, 44, 9, -38) //order 5 // 12 MAKE_COE8(25, -44, 9, 38, -38, -9, 44, -25) //order 6 // 20 MAKE_COE8(9, -25, 38, -44, 44, -38, 25, -9) //order 7 // 28 MAKE_COE8(45, 43, 40, 35, 29, 21, 13, 4) //order 8 // 2 MAKE_COE8(43, 29, 4, -21, -40, -45, -35, -13) //order 9 // 6 MAKE_COE8(40, 4, -35, -43, -13, 29, 45, 21) //order 10 // 10 MAKE_COE8(35, -21, -43, 4, 45, 13, -40, -29) //order 11 // 14 MAKE_COE8(29, -40, -13, 45, -4, -43, 21, 35) //order 12 // 18 MAKE_COE8(21, -45, 29, 13, -43, 35, 4, -40) //order 13 // 22 MAKE_COE8(13, -35, 45, -40, 21, 4, -29, 43) //order 14 // 26 MAKE_COE8(4, -13, 21, -29, 35, -40, 43, -45) //order 15 // 30 #undef MAKE_COE8 #define MAKE_COE16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \ { a00, a01, a02, a03, a04, a05, a06, a07}, \ { a15, a14, a13, a12, a11, a10, a09, a08 }, MAKE_COE16(45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2) //order 16 // 1 MAKE_COE16(45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, 
-44, -39, -30, -19, -7) //order 18 // 3 MAKE_COE16(44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11) //order 20 // 5 MAKE_COE16(43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15) //order 22 // 7 MAKE_COE16(41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19) //order 24 // 9 MAKE_COE16(39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23) //order 26 //11 MAKE_COE16(36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27) //order 28 //13 MAKE_COE16(34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30) //order 30 //15 MAKE_COE16(30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34) //order 32 //17 MAKE_COE16(27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36) //order 34 //19 MAKE_COE16(23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39) //order 36 //21 MAKE_COE16(19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41) //order 38 //23 MAKE_COE16(15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43) //order 40 //25 MAKE_COE16(11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44) //order 42 //27 MAKE_COE16(7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45) //order 44 //29 MAKE_COE16(2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45) //order 46 //31 #undef MAKE_COE16 }; /* --------------------------------------------------------------------------- */ void dct_c_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_src) { const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int ADD1 = 1 << (shift1 - 1); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD2 = 1 << (SHIFT2 - 1); __m256i c_add1 = _mm256_set1_epi32(ADD1); __m256i c_add2 = _mm256_set1_epi32(ADD2); //R---row C-column __m256i R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, R3C0, R3C1, R4C0, R4C1, R5C0, R5C1, R6C0, R6C1, R7C0, R7C1; __m256i R8C0, R8C1, R9C0, R9C1, R10C0, R10C1, R11C0, R11C1, R12C0, R12C1, R13C0, R13C1, R14C0, R14C1, R15C0, R15C1; //store anser __m256i A0C0, A0C1, A1C0, A1C1, A2C0, A2C1, A3C0, A3C1, A4C0, A4C1, A5C0, A5C1, A6C0, A6C1, A7C0, A7C1; __m256i R0R1, R2R3, R4R5, R6R7, R8R9, R10R11, R12R13, R14R15; __m256i COE0, COE1, COE2, COE3; __m256i COE_RESULT; __m256i im[32][2]; __m256i R0_ODD, R1_ODD, R2_ODD, R3_ODD, R4_ODD, R5_ODD, R6_ODD, R7_ODD; __m256i R8_ODD, R9_ODD, R10_ODD, R11_ODD, R12_ODD, R13_ODD, R14_ODD, R15_ODD; __m256i tab_t, tab_t1; coeff_t * addr; i_src &= 0xFE; /* remember to remove the flag bit */ int i; // DCT1 for (i = 0; i < 32 / 16; i++) { R0C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 0) * i_src + 0)); //[15 14 13 12 11 10... 03 02 01 00] R0C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 0) * i_src + 16)); //[31 30 29 28 11 10... 
19 18 17 16] R1C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 1) * i_src + 0)); R1C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 1) * i_src + 16)); R2C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 2) * i_src + 0)); R2C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 2) * i_src + 16)); R3C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 3) * i_src + 0)); R3C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 3) * i_src + 16)); R4C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 4) * i_src + 0)); R4C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 4) * i_src + 16)); R5C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 5) * i_src + 0)); R5C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 5) * i_src + 16)); R6C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 6) * i_src + 0)); R6C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 6) * i_src + 16)); R7C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 7) * i_src + 0)); R7C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 7) * i_src + 16)); R8C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 8) * i_src + 0)); R8C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 8) * i_src + 16)); R9C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 9) * i_src + 0)); R9C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 9) * i_src + 16)); R10C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 10) * i_src + 0)); R10C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 10) * i_src + 16)); R11C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 11) * i_src + 0)); R11C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 11) * i_src + 16)); R12C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 12) * i_src + 0)); R12C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 12) * i_src + 16)); R13C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 13) * i_src + 0)); R13C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 13) * i_src + 16)); R14C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 14) * i_src + 0)); R14C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 14) * i_src + 16)); R15C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 15) * i_src + 0)); R15C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 15) * i_src + 16)); //notice that different set / setr low dizhi butong __m256i tab_shuffle = _mm256_setr_epi16(0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A); __m256i tab_shuffle_1 = _mm256_setr_epi16(0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C, 0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C); __m256i tab_shuffle_2 = _mm256_setr_epi16(0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); //[13 10 14 09 12 11 15 08 05 02 06 01 04 03 07 00] //[29 26 30 25 28 27 31 24 21 18 22 17 20 19 23 16z R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle); R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle); R0C1 = _mm256_permute2x128_si256(R0C1, R0C1, 0x0003);//permute [21 18 22 17 20 19 23 16 / 29 26 30 25 28 27 31 24] R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle_2); // [18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] //[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] //[18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] R0_ODD = _mm256_sub_epi16(R0C0, R0C1); R0C0 = _mm256_add_epi16(R0C0, R0C1);//[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] R0C0 = _mm256_permute4x64_epi64(R0C0, 0x00D8);//[13 10 14 09 05 02 06 01 / 12 11 15 08 04 03 07 00] R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle_1);//[10 05 13 02 09 06 14 01 / 11 04 12 
03 08 07 15 00] R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle); R1C1 = _mm256_permute2x128_si256(R1C1, R1C1, 0x0003); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle_2); R1_ODD = _mm256_sub_epi16(R1C0, R1C1); R1C0 = _mm256_add_epi16(R1C0, R1C1); R1C0 = _mm256_permute4x64_epi64(R1C0, 0x00D8); R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle_1); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle); R2C1 = _mm256_permute2x128_si256(R2C1, R2C1, 0x0003); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle_2); R2_ODD = _mm256_sub_epi16(R2C0, R2C1); R2C0 = _mm256_add_epi16(R2C0, R2C1); R2C0 = _mm256_permute4x64_epi64(R2C0, 0x00D8); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle_1); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle); R3C1 = _mm256_permute2x128_si256(R3C1, R3C1, 0x0003); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle_2); R3_ODD = _mm256_sub_epi16(R3C0, R3C1); R3C0 = _mm256_add_epi16(R3C0, R3C1); R3C0 = _mm256_permute4x64_epi64(R3C0, 0x00D8); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle_1); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle); R4C1 = _mm256_shuffle_epi8(R4C1, tab_shuffle); R4C1 = _mm256_permute2x128_si256(R4C1, R4C1, 0x0003); R4C1 = _mm256_shuffle_epi8(R4C1, tab_shuffle_2); R4_ODD = _mm256_sub_epi16(R4C0, R4C1); R4C0 = _mm256_add_epi16(R4C0, R4C1); R4C0 = _mm256_permute4x64_epi64(R4C0, 0x00D8); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle_1); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle); R5C1 = _mm256_permute2x128_si256(R5C1, R5C1, 0x0003); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle_2); R5_ODD = _mm256_sub_epi16(R5C0, R5C1); R5C0 = _mm256_add_epi16(R5C0, R5C1); R5C0 = _mm256_permute4x64_epi64(R5C0, 0x00D8); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle_1); R6C0 = _mm256_shuffle_epi8(R6C0, tab_shuffle); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle); R6C1 = _mm256_permute2x128_si256(R6C1, R6C1, 0x0003); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle_2); R6_ODD = _mm256_sub_epi16(R6C0, R6C1); R6C0 = _mm256_add_epi16(R6C0, R6C1); R6C0 = _mm256_permute4x64_epi64(R6C0, 0x00D8); R6C0 = _mm256_shuffle_epi8(R6C0, tab_shuffle_1); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle); R7C1 = _mm256_permute2x128_si256(R7C1, R7C1, 0x0003); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle_2); R7_ODD = _mm256_sub_epi16(R7C0, R7C1); R7C0 = _mm256_add_epi16(R7C0, R7C1); R7C0 = _mm256_permute4x64_epi64(R7C0, 0x00D8); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle_1); R8C0 = _mm256_shuffle_epi8(R8C0, tab_shuffle); R8C1 = _mm256_shuffle_epi8(R8C1, tab_shuffle); R8C1 = _mm256_permute2x128_si256(R8C1, R8C1, 0x0003); R8C1 = _mm256_shuffle_epi8(R8C1, tab_shuffle_2); R8_ODD = _mm256_sub_epi16(R8C0, R8C1); R8C0 = _mm256_add_epi16(R8C0, R8C1); R8C0 = _mm256_permute4x64_epi64(R8C0, 0x00D8); R8C0 = _mm256_shuffle_epi8(R8C0, tab_shuffle_1); R9C0 = _mm256_shuffle_epi8(R9C0, tab_shuffle); R9C1 = _mm256_shuffle_epi8(R9C1, tab_shuffle); R9C1 = _mm256_permute2x128_si256(R9C1, R9C1, 0x0003); R9C1 = _mm256_shuffle_epi8(R9C1, tab_shuffle_2); R9_ODD = _mm256_sub_epi16(R9C0, R9C1); R9C0 = _mm256_add_epi16(R9C0, R9C1); R9C0 = _mm256_permute4x64_epi64(R9C0, 0x00D8); R9C0 = _mm256_shuffle_epi8(R9C0, tab_shuffle_1); R10C0 = _mm256_shuffle_epi8(R10C0, tab_shuffle); R10C1 = _mm256_shuffle_epi8(R10C1, tab_shuffle); R10C1 = _mm256_permute2x128_si256(R10C1, R10C1, 0x0003); R10C1 = _mm256_shuffle_epi8(R10C1, tab_shuffle_2); R10_ODD = 
_mm256_sub_epi16(R10C0, R10C1); R10C0 = _mm256_add_epi16(R10C0, R10C1); R10C0 = _mm256_permute4x64_epi64(R10C0, 0x00D8); R10C0 = _mm256_shuffle_epi8(R10C0, tab_shuffle_1); R11C0 = _mm256_shuffle_epi8(R11C0, tab_shuffle); R11C1 = _mm256_shuffle_epi8(R11C1, tab_shuffle); R11C1 = _mm256_permute2x128_si256(R11C1, R11C1, 0x0003); R11C1 = _mm256_shuffle_epi8(R11C1, tab_shuffle_2); R11_ODD = _mm256_sub_epi16(R11C0, R11C1); R11C0 = _mm256_add_epi16(R11C0, R11C1); R11C0 = _mm256_permute4x64_epi64(R11C0, 0x00D8); R11C0 = _mm256_shuffle_epi8(R11C0, tab_shuffle_1); R12C0 = _mm256_shuffle_epi8(R12C0, tab_shuffle); R12C1 = _mm256_shuffle_epi8(R12C1, tab_shuffle); R12C1 = _mm256_permute2x128_si256(R12C1, R12C1, 0x0003); R12C1 = _mm256_shuffle_epi8(R12C1, tab_shuffle_2); R12_ODD = _mm256_sub_epi16(R12C0, R12C1); R12C0 = _mm256_add_epi16(R12C0, R12C1); R12C0 = _mm256_permute4x64_epi64(R12C0, 0x00D8); R12C0 = _mm256_shuffle_epi8(R12C0, tab_shuffle_1); R13C0 = _mm256_shuffle_epi8(R13C0, tab_shuffle); R13C1 = _mm256_shuffle_epi8(R13C1, tab_shuffle); R13C1 = _mm256_permute2x128_si256(R13C1, R13C1, 0x0003); R13C1 = _mm256_shuffle_epi8(R13C1, tab_shuffle_2); R13_ODD = _mm256_sub_epi16(R13C0, R13C1); R13C0 = _mm256_add_epi16(R13C0, R13C1); R13C0 = _mm256_permute4x64_epi64(R13C0, 0x00D8); R13C0 = _mm256_shuffle_epi8(R13C0, tab_shuffle_1); R14C0 = _mm256_shuffle_epi8(R14C0, tab_shuffle); R14C1 = _mm256_shuffle_epi8(R14C1, tab_shuffle); R14C1 = _mm256_permute2x128_si256(R14C1, R14C1, 0x0003); R14C1 = _mm256_shuffle_epi8(R14C1, tab_shuffle_2); R14_ODD = _mm256_sub_epi16(R14C0, R14C1); R14C0 = _mm256_add_epi16(R14C0, R14C1); R14C0 = _mm256_permute4x64_epi64(R14C0, 0x00D8); R14C0 = _mm256_shuffle_epi8(R14C0, tab_shuffle_1); R15C0 = _mm256_shuffle_epi8(R15C0, tab_shuffle); R15C1 = _mm256_shuffle_epi8(R15C1, tab_shuffle); R15C1 = _mm256_permute2x128_si256(R15C1, R15C1, 0x0003); R15C1 = _mm256_shuffle_epi8(R15C1, tab_shuffle_2); R15_ODD = _mm256_sub_epi16(R15C0, R15C1); R15C0 = _mm256_add_epi16(R15C0, R15C1); R15C0 = _mm256_permute4x64_epi64(R15C0, 0x00D8); R15C0 = _mm256_shuffle_epi8(R15C0, tab_shuffle_1); R0R1 = _mm256_hadd_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 103 107 100 004 003 007 000] R2R3 = _mm256_hadd_epi16(R2C0, R3C0); R4R5 = _mm256_hadd_epi16(R4C0, R5C0); R6R7 = _mm256_hadd_epi16(R6C0, R7C0); R8R9 = _mm256_hadd_epi16(R8C0, R9C0);//[905 902 906 901 805 802 806 801 / 904 903 907 900 804 803 807 800] R10R11 = _mm256_hadd_epi16(R10C0, R11C0); R12R13 = _mm256_hadd_epi16(R12C0, R13C0); R14R15 = _mm256_hadd_epi16(R14C0, R15C0); // mul the coefficient //0th row ,1th row [105+102 106+101 005+002 006+001 / 104+103 107+100 004+003 007+000] tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0]); A0C0 = _mm256_madd_epi16(R0R1, tab_t); A1C0 = _mm256_madd_epi16(R2R3, tab_t);// 2 3 A2C0 = _mm256_madd_epi16(R4R5, tab_t);// 4 5 A3C0 = _mm256_madd_epi16(R6R7, tab_t);// 6 7 A4C0 = _mm256_madd_epi16(R8R9, tab_t);// 8 9 A5C0 = _mm256_madd_epi16(R10R11, tab_t);//10 11 A6C0 = _mm256_madd_epi16(R12R13, tab_t);//12 13 A7C0 = _mm256_madd_epi16(R14R15, tab_t);//14 15 A0C0 = _mm256_hadd_epi32(A0C0, A1C0); //[3B 2B 1B 0B(05+02+06+01) / 3A 2A 1A 0A(04+03+07+00)] A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); //[3A 2A 1A 0A / 3B 2B 1B 0B] A2C0 = _mm256_hadd_epi32(A2C0, A3C0); //[7B 6B 5B 4B / 7A 6A 5A 4A] A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001);//[7A 6A 5A 4A / 7B 6B 5B 4B] A4C0 = _mm256_hadd_epi32(A4C0, A5C0); //[11B 10B 9B 8B / 11A 10A 9A 8A] A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 
0x0001);//[11A 10A 9A 8A / 11B 10B 9B 8B] A6C0 = _mm256_hadd_epi32(A6C0, A7C0); //[15B 14B 13B 12B / 15A 14A 13A 12A] A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001);//[15A 14A 13A 12A / 15B 14B 13B 12B] COE0 = _mm256_add_epi32(A0C0, A1C0); //the same line`s data add to low 128 bit COE1 = _mm256_add_epi32(A2C0, A3C0); COE2 = _mm256_add_epi32(A4C0, A5C0); COE3 = _mm256_add_epi32(A6C0, A7C0); //low 128 bit is 0 1 2 3 rows data ,the high 128 bit is 8 9 10 11 rows data COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0);//[11 10 9 8 3 2 1 0] COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0);//[15 14 13 12 7 6 5 4] COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); COE_RESULT = _mm256_packs_epi32(COE0, COE1);//[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] im[0][i] = COE_RESULT; COE0 = _mm256_sub_epi32(A0C0, A1C0); COE1 = _mm256_sub_epi32(A2C0, A3C0); COE2 = _mm256_sub_epi32(A4C0, A5C0); COE3 = _mm256_sub_epi32(A6C0, A7C0); COE0 = _mm256_permute2f128_si256(COE0, COE2, 0x0020); COE1 = _mm256_permute2f128_si256(COE1, COE3, 0x0020); COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); COE_RESULT = _mm256_packs_epi32(COE0, COE1);//[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] im[16][i] = COE_RESULT; #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0R1, tab_t); \ A1C0 = _mm256_madd_epi16(R2R3, tab_t); \ A2C0 = _mm256_madd_epi16(R4R5, tab_t); \ A3C0 = _mm256_madd_epi16(R6R7, tab_t); \ A4C0 = _mm256_madd_epi16(R8R9, tab_t); \ A5C0 = _mm256_madd_epi16(R10R11, tab_t); \ A6C0 = _mm256_madd_epi16(R12R13, tab_t); \ A7C0 = _mm256_madd_epi16(R14R15, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001); \ \ A4C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 0x0001); \ \ A6C0 = _mm256_hadd_epi32(A6C0, A7C0); \ A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); \ COE1 = _mm256_add_epi32(A2C0, A3C0); \ COE2 = _mm256_add_epi32(A4C0, A5C0); \ COE3 = _mm256_add_epi32(A6C0, A7C0); \ \ COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0); \ COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(1, 8); MAKE_ODD(2, 24); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0R1, tab_t); \ A1C0 = _mm256_madd_epi16(R2R3, tab_t); \ A2C0 = _mm256_madd_epi16(R4R5, tab_t); \ A3C0 = _mm256_madd_epi16(R6R7, tab_t); \ A4C0 = _mm256_madd_epi16(R8R9, tab_t); \ A5C0 = _mm256_madd_epi16(R10R11, tab_t); \ A6C0 = _mm256_madd_epi16(R12R13, tab_t); \ A7C0 = _mm256_madd_epi16(R14R15, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001); \ \ A4C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 0x0001); \ \ A6C0 = _mm256_hadd_epi32(A6C0, A7C0); \ A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); 
\ COE1 = _mm256_add_epi32(A2C0, A3C0); \ COE2 = _mm256_add_epi32(A4C0, A5C0); \ COE3 = _mm256_add_epi32(A6C0, A7C0); \ \ COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0); \ COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(3, 4); MAKE_ODD(4, 12); MAKE_ODD(5, 20); MAKE_ODD(6, 28); R0R1 = _mm256_hsub_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 103 107 100 004 003 007 000] R2R3 = _mm256_hsub_epi16(R2C0, R3C0); R4R5 = _mm256_hsub_epi16(R4C0, R5C0); R6R7 = _mm256_hsub_epi16(R6C0, R7C0); R8R9 = _mm256_hsub_epi16(R8C0, R9C0);//[905 902 906 901 805 802 806 801 / 904 903 907 900 804 803 807 800] R10R11 = _mm256_hsub_epi16(R10C0, R11C0); R12R13 = _mm256_hsub_epi16(R12C0, R13C0); R14R15 = _mm256_hsub_epi16(R14C0, R15C0); MAKE_ODD(7, 2); MAKE_ODD(8, 6); MAKE_ODD(9, 10); MAKE_ODD(10, 14); MAKE_ODD(11, 18); MAKE_ODD(12, 22); MAKE_ODD(13, 26); MAKE_ODD(14, 30); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0_ODD, tab_t); \ A0C1 = _mm256_madd_epi16(R1_ODD, tab_t); \ A1C0 = _mm256_madd_epi16(R2_ODD, tab_t); \ A1C1 = _mm256_madd_epi16(R3_ODD, tab_t); \ A2C0 = _mm256_madd_epi16(R4_ODD, tab_t); \ A2C1 = _mm256_madd_epi16(R5_ODD, tab_t); \ A3C0 = _mm256_madd_epi16(R6_ODD, tab_t); \ A3C1 = _mm256_madd_epi16(R7_ODD, tab_t); \ A4C0 = _mm256_madd_epi16(R8_ODD, tab_t); \ A4C1 = _mm256_madd_epi16(R9_ODD, tab_t); \ A5C0 = _mm256_madd_epi16(R10_ODD, tab_t); \ A5C1 = _mm256_madd_epi16(R11_ODD, tab_t); \ A6C0 = _mm256_madd_epi16(R12_ODD, tab_t); \ A6C1 = _mm256_madd_epi16(R13_ODD, tab_t); \ A7C0 = _mm256_madd_epi16(R14_ODD, tab_t); \ A7C1 = _mm256_madd_epi16(R15_ODD, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A0C1); \ A1C0 = _mm256_hadd_epi32(A1C0, A1C1); \ A2C0 = _mm256_hadd_epi32(A2C0, A2C1); \ A3C0 = _mm256_hadd_epi32(A3C0, A3C1); \ A4C0 = _mm256_hadd_epi32(A4C0, A4C1); \ A5C0 = _mm256_hadd_epi32(A5C0, A5C1); \ A6C0 = _mm256_hadd_epi32(A6C0, A6C1); \ A7C0 = _mm256_hadd_epi32(A7C0, A7C1); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A2C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A3C0 = _mm256_hadd_epi32(A6C0, A7C0); \ \ A0C1 = _mm256_permute2f128_si256(A0C0, A2C0, 0x0020); \ A1C1 = _mm256_permute2f128_si256(A0C0, A2C0, 0x0031); \ A2C1 = _mm256_permute2f128_si256(A1C0, A3C0, 0x0020); \ A3C1 = _mm256_permute2f128_si256(A1C0, A3C0, 0x0031); \ \ COE0 = _mm256_add_epi32(A0C1, A1C1); \ COE1 = _mm256_add_epi32(A2C1, A3C1); \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(15, 1); MAKE_ODD(16, 3); MAKE_ODD(17, 5); MAKE_ODD(18, 7); MAKE_ODD(19, 9); MAKE_ODD(20, 11); MAKE_ODD(21, 13); MAKE_ODD(22, 15); MAKE_ODD(23, 17); MAKE_ODD(24, 19); MAKE_ODD(25, 21); MAKE_ODD(26, 23); MAKE_ODD(27, 25); MAKE_ODD(28, 27); MAKE_ODD(29, 29); MAKE_ODD(30, 31); #undef MAKE_ODD } __m128i mask = _mm_set1_epi16(0xffff); //DCT2 for (i = 0; i < 32 / 8; i++){ R0C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 0), mask)); R0C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 8), mask)); R1C0 = 
_mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 16), mask)); R1C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 24), mask)); R2C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 0), mask)); R2C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 8), mask)); R3C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 16), mask)); R3C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 24), mask)); R4C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 0), mask)); R4C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 8), mask)); R5C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 16), mask)); R5C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 24), mask)); R6C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 0), mask)); R6C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 8), mask)); R7C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 16), mask)); R7C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 24), mask)); R8C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 0), mask)); R8C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 8), mask)); R9C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 16), mask)); R9C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 24), mask)); R10C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 0), mask)); R10C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 8), mask)); R11C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 16), mask)); R11C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 24), mask)); R12C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 0), mask)); R12C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 8), mask)); R13C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 16), mask)); R13C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 24), mask)); R14C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 0), mask)); R14C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 8), mask)); R15C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 16), mask)); R15C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 24), mask)); // inverse _m256i per 32 bit __m256i tab_inv = _mm256_setr_epi32(0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001, 0x0000); R0C1 = _mm256_permutevar8x32_epi32(R0C1, 
tab_inv); //[8 9 10 11 / 12 13 14 15] R1C1 = _mm256_permutevar8x32_epi32(R1C1, tab_inv); //[24 25 26 27 / 28 29 30 31] R0_ODD = _mm256_sub_epi32(R0C0, R1C1); //[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] R15_ODD = _mm256_sub_epi32(R0C1, R1C0); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] R0C0 = _mm256_add_epi32(R0C0, R1C1); //[7 6 5 4 / 3 2 1 0] R0C1 = _mm256_add_epi32(R0C1, R1C0); //[8 9 10 11 / 12 13 14 15] A0C0 = _mm256_add_epi32(R0C0, R0C1); //[7 6 5 4 / 3 2 1 0] R2C1 = _mm256_permutevar8x32_epi32(R2C1, tab_inv); R3C1 = _mm256_permutevar8x32_epi32(R3C1, tab_inv); R1_ODD = _mm256_sub_epi32(R2C0, R3C1); R14_ODD = _mm256_sub_epi32(R2C1, R3C0); R1C0 = _mm256_add_epi32(R2C0, R3C1); R1C1 = _mm256_add_epi32(R2C1, R3C0); A1C0 = _mm256_add_epi32(R1C0, R1C1); R4C1 = _mm256_permutevar8x32_epi32(R4C1, tab_inv); R5C1 = _mm256_permutevar8x32_epi32(R5C1, tab_inv); R2_ODD = _mm256_sub_epi32(R4C0, R5C1); R13_ODD = _mm256_sub_epi32(R4C1, R5C0); R2C0 = _mm256_add_epi32(R4C0, R5C1); R2C1 = _mm256_add_epi32(R4C1, R5C0); A2C0 = _mm256_add_epi32(R2C0, R2C1); R6C1 = _mm256_permutevar8x32_epi32(R6C1, tab_inv); R7C1 = _mm256_permutevar8x32_epi32(R7C1, tab_inv); R3_ODD = _mm256_sub_epi32(R6C0, R7C1); R12_ODD = _mm256_sub_epi32(R6C1, R7C0); R3C0 = _mm256_add_epi32(R6C0, R7C1); R3C1 = _mm256_add_epi32(R6C1, R7C0); A3C0 = _mm256_add_epi32(R3C0, R3C1); R8C1 = _mm256_permutevar8x32_epi32(R8C1, tab_inv); R9C1 = _mm256_permutevar8x32_epi32(R9C1, tab_inv); R4_ODD = _mm256_sub_epi32(R8C0, R9C1); R11_ODD = _mm256_sub_epi32(R8C1, R9C0); R4C0 = _mm256_add_epi32(R8C0, R9C1); R4C1 = _mm256_add_epi32(R8C1, R9C0); A4C0 = _mm256_add_epi32(R4C0, R4C1); R10C1 = _mm256_permutevar8x32_epi32(R10C1, tab_inv); R11C1 = _mm256_permutevar8x32_epi32(R11C1, tab_inv); R5_ODD = _mm256_sub_epi32(R10C0, R11C1); R10_ODD = _mm256_sub_epi32(R10C1, R11C0); R5C0 = _mm256_add_epi32(R10C0, R11C1); R5C1 = _mm256_add_epi32(R10C1, R11C0); A5C0 = _mm256_add_epi32(R5C0, R5C1); R12C1 = _mm256_permutevar8x32_epi32(R12C1, tab_inv); R13C1 = _mm256_permutevar8x32_epi32(R13C1, tab_inv); R6_ODD = _mm256_sub_epi32(R12C0, R13C1); R9_ODD = _mm256_sub_epi32(R12C1, R13C0); R6C0 = _mm256_add_epi32(R12C0, R13C1); R6C1 = _mm256_add_epi32(R12C1, R13C0); A6C0 = _mm256_add_epi32(R6C0, R6C1); R14C1 = _mm256_permutevar8x32_epi32(R14C1, tab_inv); R15C1 = _mm256_permutevar8x32_epi32(R15C1, tab_inv); R7_ODD = _mm256_sub_epi32(R14C0, R15C1);//[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] R8_ODD = _mm256_sub_epi32(R14C1, R15C0); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] R7C0 = _mm256_add_epi32(R14C0, R15C1); //[7 6 5 4 / 3 2 1 0] R7C1 = _mm256_add_epi32(R14C1, R15C0); //[8 9 10 11 / 12 13 14 15] A7C0 = _mm256_add_epi32(R7C0, R7C1); //[7 6 5 4 / 3 2 1 0] __m256i result_mask = _mm256_setr_epi32(0xf0000000, 0xf0000000, 0xf0000000, 0xf0000000, 0, 0, 0, 0); #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab]); \ A0C1 = _mm256_mullo_epi32(A0C0, tab_t); \ A1C1 = _mm256_mullo_epi32(A1C0, tab_t); \ A2C1 = _mm256_mullo_epi32(A2C0, tab_t); \ A3C1 = _mm256_mullo_epi32(A3C0, tab_t); \ A4C1 = _mm256_mullo_epi32(A4C0, tab_t); \ A5C1 = _mm256_mullo_epi32(A5C0, tab_t); \ A6C1 = _mm256_mullo_epi32(A6C0, tab_t); \ A7C1 = _mm256_mullo_epi32(A7C0, tab_t); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); /* [107+106 105+104 007+006 005+004 / 103+102 101+100 003+002 001+000] */\ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); /* [3A 2A 1A 0A / 3B 
2B 1B 0B] */\ COE1 = _mm256_hadd_epi32(COE2, COE3); /* [7A 6A 5A 4A / 7B 6B 5B 4B] */\ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); /*[7B 6B 5B 4B / 3B 2B 1B 0B]*/\ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); /*[7A 6A 5A 4A / 3A 2A 1A 0A]*/\ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); /* [7 6 5 4 / 3 2 1 0] */\ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, c_add2), SHIFT2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001);/* [3 2 1 0 / 7 6 5 4] */ \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); /*[3 2 1 0 7 6 5 4 / 7 6 5 4 3 2 1 0]*/\ addr = (dst + (dstPos * 32) + (i * 8)); \ \ _mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); //_mm256_storeu2_m128i(addr, &COE3, COE_RESULT); /*_mm256_storeu_si256(addr, COE_RESULT);*/ MAKE_ODD(0, 0); MAKE_ODD(1, 8); MAKE_ODD(2, 16); MAKE_ODD(3, 24); MAKE_ODD(4, 4); MAKE_ODD(5, 12); MAKE_ODD(6, 20); MAKE_ODD(7, 28); A0C0 = _mm256_sub_epi32(R0C0, R0C1); A1C0 = _mm256_sub_epi32(R1C0, R1C1); A2C0 = _mm256_sub_epi32(R2C0, R2C1); A3C0 = _mm256_sub_epi32(R3C0, R3C1); A4C0 = _mm256_sub_epi32(R4C0, R4C1); A5C0 = _mm256_sub_epi32(R5C0, R5C1); A6C0 = _mm256_sub_epi32(R6C0, R6C1); A7C0 = _mm256_sub_epi32(R7C0, R7C1); MAKE_ODD(8, 2); MAKE_ODD(9, 6); MAKE_ODD(10, 10); MAKE_ODD(11, 14); MAKE_ODD(12, 18); MAKE_ODD(13, 22); MAKE_ODD(14, 26); MAKE_ODD(15, 30); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab]); \ tab_t1 = _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]); \ A0C1 = _mm256_add_epi32(_mm256_mullo_epi32(R0_ODD, tab_t), _mm256_mullo_epi32(R15_ODD, tab_t1)); \ A1C1 = _mm256_add_epi32(_mm256_mullo_epi32(R1_ODD, tab_t), _mm256_mullo_epi32(R14_ODD, tab_t1)); \ A2C1 = _mm256_add_epi32(_mm256_mullo_epi32(R2_ODD, tab_t), _mm256_mullo_epi32(R13_ODD, tab_t1)); \ A3C1 = _mm256_add_epi32(_mm256_mullo_epi32(R3_ODD, tab_t), _mm256_mullo_epi32(R12_ODD, tab_t1)); \ A4C1 = _mm256_add_epi32(_mm256_mullo_epi32(R4_ODD, tab_t), _mm256_mullo_epi32(R11_ODD, tab_t1)); \ A5C1 = _mm256_add_epi32(_mm256_mullo_epi32(R5_ODD, tab_t), _mm256_mullo_epi32(R10_ODD, tab_t1)); \ A6C1 = _mm256_add_epi32(_mm256_mullo_epi32(R6_ODD, tab_t), _mm256_mullo_epi32(R9_ODD, tab_t1)); \ A7C1 = _mm256_add_epi32(_mm256_mullo_epi32(R7_ODD, tab_t), _mm256_mullo_epi32(R8_ODD, tab_t1)); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); \ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); \ COE1 = _mm256_hadd_epi32(COE2, COE3); \ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); \ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); \ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); \ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, c_add2), SHIFT2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001); \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); \ addr = (dst + (dstPos * 32) + (i * 8)); \ \ _mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); //_mm256_storeu2_m128i(addr, &COE3, COE_RESULT); //_mm256_storeu_si256(addr, COE_RESULT); MAKE_ODD(16, 1); MAKE_ODD(18, 3); MAKE_ODD(20, 5); MAKE_ODD(22, 7); MAKE_ODD(24, 9); MAKE_ODD(26, 11); MAKE_ODD(28, 13); MAKE_ODD(30, 15); MAKE_ODD(32, 17); MAKE_ODD(34, 19); MAKE_ODD(36, 21); MAKE_ODD(38, 23); MAKE_ODD(40, 25); MAKE_ODD(42, 27); MAKE_ODD(44, 29); MAKE_ODD(46, 31); #undef MAKE_ODD } } /* --------------------------------------------------------------------------- */ void 
dct_c_32x32_half_avx2(const coeff_t *src, coeff_t *dst, int i_src) { const int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT + (i_src & 0x01); const int ADD1 = 1 << (shift1 - 1); const int SHIFT2 = B32X32_IN_BIT + FACTO_BIT; const int ADD2 = 1 << (SHIFT2 - 1); __m256i c_add1 = _mm256_set1_epi32(ADD1); __m256i c_add2 = _mm256_set1_epi32(ADD2); // R -- row, C -- column __m256i R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, R3C0, R3C1, R4C0, R4C1, R5C0, R5C1, R6C0, R6C1, R7C0, R7C1; __m256i R8C0, R8C1, R9C0, R9C1, R10C0, R10C1, R11C0, R11C1, R12C0, R12C1, R13C0, R13C1, R14C0, R14C1, R15C0, R15C1; // store answer __m256i A0C0, A0C1, A1C0, A1C1, A2C0, A2C1, A3C0, A3C1, A4C0, A4C1, A5C0, A5C1, A6C0, A6C1, A7C0, A7C1; __m256i R0R1, R2R3, R4R5, R6R7, R8R9, R10R11, R12R13, R14R15; __m256i COE0, COE1, COE2, COE3; __m256i COE_RESULT; __m256i im[16][2]; __m256i R0_ODD, R1_ODD, R2_ODD, R3_ODD, R4_ODD, R5_ODD, R6_ODD, R7_ODD; __m256i R8_ODD, R9_ODD, R10_ODD, R11_ODD, R12_ODD, R13_ODD, R14_ODD, R15_ODD; __m256i tab_t, tab_t1; coeff_t * addr; i_src &= 0xFE; /* remember to remove the flag bit */ int i; // DCT1 for (i = 0; i < 32 / 16; i++) { R0C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 0) * i_src + 0)); //[15 14 13 12 11 10... 03 02 01 00] R0C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 0) * i_src + 16)); //[31 30 29 28 27 26... 19 18 17 16] R1C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 1) * i_src + 0)); R1C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 1) * i_src + 16)); R2C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 2) * i_src + 0)); R2C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 2) * i_src + 16)); R3C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 3) * i_src + 0)); R3C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 3) * i_src + 16)); R4C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 4) * i_src + 0)); R4C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 4) * i_src + 16)); R5C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 5) * i_src + 0)); R5C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 5) * i_src + 16)); R6C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 6) * i_src + 0)); R6C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 6) * i_src + 16)); R7C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 7) * i_src + 0)); R7C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 7) * i_src + 16)); R8C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 8) * i_src + 0)); R8C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 8) * i_src + 16)); R9C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 9) * i_src + 0)); R9C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 9) * i_src + 16)); R10C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 10) * i_src + 0)); R10C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 10) * i_src + 16)); R11C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 11) * i_src + 0)); R11C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 11) * i_src + 16)); R12C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 12) * i_src + 0)); R12C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 12) * i_src + 16)); R13C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 13) * i_src + 0)); R13C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 13) * i_src + 16)); R14C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 14) * i_src + 0)); R14C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 14) * i_src + 16)); R15C0 = _mm256_load_si256((__m256i*)(src + (i * 16 + 15) * i_src + 0)); R15C1 = _mm256_load_si256((__m256i*)(src + (i * 16 + 15) * i_src + 16)); // note: _mm256_set_epi16 and _mm256_setr_epi16 fill elements in opposite order (setr puts its first argument at the lowest address)
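/* Pass 1 (horizontal transform): the shuffle tables below pair every sample
   x[k] with its mirror x[31-k], so a single add/sub per row yields the even
   part (x[k] + x[31-k]) in R*C0 and the odd part (x[k] - x[31-k]) in R*_ODD,
   which are then multiplied by the tab_dct_32x32_avx2[] coefficients. */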
__m256i tab_shuffle = _mm256_setr_epi16(0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A); __m256i tab_shuffle_1 = _mm256_setr_epi16(0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C, 0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C); __m256i tab_shuffle_2 = _mm256_setr_epi16(0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); //[13 10 14 09 12 11 15 08 05 02 06 01 04 03 07 00] //[29 26 30 25 28 27 31 24 21 18 22 17 20 19 23 16z R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle); R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle); R0C1 = _mm256_permute2x128_si256(R0C1, R0C1, 0x0003);//permute [21 18 22 17 20 19 23 16 / 29 26 30 25 28 27 31 24] R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle_2); // [18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] //[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] //[18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] R0_ODD = _mm256_sub_epi16(R0C0, R0C1); R0C0 = _mm256_add_epi16(R0C0, R0C1);//[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] R0C0 = _mm256_permute4x64_epi64(R0C0, 0x00D8);//[13 10 14 09 05 02 06 01 / 12 11 15 08 04 03 07 00] R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle_1);//[10 05 13 02 09 06 14 01 / 11 04 12 03 08 07 15 00] R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle); R1C1 = _mm256_permute2x128_si256(R1C1, R1C1, 0x0003); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle_2); R1_ODD = _mm256_sub_epi16(R1C0, R1C1); R1C0 = _mm256_add_epi16(R1C0, R1C1); R1C0 = _mm256_permute4x64_epi64(R1C0, 0x00D8); R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle_1); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle); R2C1 = _mm256_permute2x128_si256(R2C1, R2C1, 0x0003); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle_2); R2_ODD = _mm256_sub_epi16(R2C0, R2C1); R2C0 = _mm256_add_epi16(R2C0, R2C1); R2C0 = _mm256_permute4x64_epi64(R2C0, 0x00D8); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle_1); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle); R3C1 = _mm256_permute2x128_si256(R3C1, R3C1, 0x0003); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle_2); R3_ODD = _mm256_sub_epi16(R3C0, R3C1); R3C0 = _mm256_add_epi16(R3C0, R3C1); R3C0 = _mm256_permute4x64_epi64(R3C0, 0x00D8); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle_1); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle); R4C1 = _mm256_shuffle_epi8(R4C1, tab_shuffle); R4C1 = _mm256_permute2x128_si256(R4C1, R4C1, 0x0003); R4C1 = _mm256_shuffle_epi8(R4C1, tab_shuffle_2); R4_ODD = _mm256_sub_epi16(R4C0, R4C1); R4C0 = _mm256_add_epi16(R4C0, R4C1); R4C0 = _mm256_permute4x64_epi64(R4C0, 0x00D8); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle_1); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle); R5C1 = _mm256_permute2x128_si256(R5C1, R5C1, 0x0003); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle_2); R5_ODD = _mm256_sub_epi16(R5C0, R5C1); R5C0 = _mm256_add_epi16(R5C0, R5C1); R5C0 = _mm256_permute4x64_epi64(R5C0, 0x00D8); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle_1); R6C0 = _mm256_shuffle_epi8(R6C0, tab_shuffle); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle); R6C1 = _mm256_permute2x128_si256(R6C1, R6C1, 0x0003); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle_2); R6_ODD = _mm256_sub_epi16(R6C0, R6C1); R6C0 = _mm256_add_epi16(R6C0, R6C1); R6C0 = _mm256_permute4x64_epi64(R6C0, 0x00D8); R6C0 = 
_mm256_shuffle_epi8(R6C0, tab_shuffle_1); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle); R7C1 = _mm256_permute2x128_si256(R7C1, R7C1, 0x0003); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle_2); R7_ODD = _mm256_sub_epi16(R7C0, R7C1); R7C0 = _mm256_add_epi16(R7C0, R7C1); R7C0 = _mm256_permute4x64_epi64(R7C0, 0x00D8); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle_1); R8C0 = _mm256_shuffle_epi8(R8C0, tab_shuffle); R8C1 = _mm256_shuffle_epi8(R8C1, tab_shuffle); R8C1 = _mm256_permute2x128_si256(R8C1, R8C1, 0x0003); R8C1 = _mm256_shuffle_epi8(R8C1, tab_shuffle_2); R8_ODD = _mm256_sub_epi16(R8C0, R8C1); R8C0 = _mm256_add_epi16(R8C0, R8C1); R8C0 = _mm256_permute4x64_epi64(R8C0, 0x00D8); R8C0 = _mm256_shuffle_epi8(R8C0, tab_shuffle_1); R9C0 = _mm256_shuffle_epi8(R9C0, tab_shuffle); R9C1 = _mm256_shuffle_epi8(R9C1, tab_shuffle); R9C1 = _mm256_permute2x128_si256(R9C1, R9C1, 0x0003); R9C1 = _mm256_shuffle_epi8(R9C1, tab_shuffle_2); R9_ODD = _mm256_sub_epi16(R9C0, R9C1); R9C0 = _mm256_add_epi16(R9C0, R9C1); R9C0 = _mm256_permute4x64_epi64(R9C0, 0x00D8); R9C0 = _mm256_shuffle_epi8(R9C0, tab_shuffle_1); R10C0 = _mm256_shuffle_epi8(R10C0, tab_shuffle); R10C1 = _mm256_shuffle_epi8(R10C1, tab_shuffle); R10C1 = _mm256_permute2x128_si256(R10C1, R10C1, 0x0003); R10C1 = _mm256_shuffle_epi8(R10C1, tab_shuffle_2); R10_ODD = _mm256_sub_epi16(R10C0, R10C1); R10C0 = _mm256_add_epi16(R10C0, R10C1); R10C0 = _mm256_permute4x64_epi64(R10C0, 0x00D8); R10C0 = _mm256_shuffle_epi8(R10C0, tab_shuffle_1); R11C0 = _mm256_shuffle_epi8(R11C0, tab_shuffle); R11C1 = _mm256_shuffle_epi8(R11C1, tab_shuffle); R11C1 = _mm256_permute2x128_si256(R11C1, R11C1, 0x0003); R11C1 = _mm256_shuffle_epi8(R11C1, tab_shuffle_2); R11_ODD = _mm256_sub_epi16(R11C0, R11C1); R11C0 = _mm256_add_epi16(R11C0, R11C1); R11C0 = _mm256_permute4x64_epi64(R11C0, 0x00D8); R11C0 = _mm256_shuffle_epi8(R11C0, tab_shuffle_1); R12C0 = _mm256_shuffle_epi8(R12C0, tab_shuffle); R12C1 = _mm256_shuffle_epi8(R12C1, tab_shuffle); R12C1 = _mm256_permute2x128_si256(R12C1, R12C1, 0x0003); R12C1 = _mm256_shuffle_epi8(R12C1, tab_shuffle_2); R12_ODD = _mm256_sub_epi16(R12C0, R12C1); R12C0 = _mm256_add_epi16(R12C0, R12C1); R12C0 = _mm256_permute4x64_epi64(R12C0, 0x00D8); R12C0 = _mm256_shuffle_epi8(R12C0, tab_shuffle_1); R13C0 = _mm256_shuffle_epi8(R13C0, tab_shuffle); R13C1 = _mm256_shuffle_epi8(R13C1, tab_shuffle); R13C1 = _mm256_permute2x128_si256(R13C1, R13C1, 0x0003); R13C1 = _mm256_shuffle_epi8(R13C1, tab_shuffle_2); R13_ODD = _mm256_sub_epi16(R13C0, R13C1); R13C0 = _mm256_add_epi16(R13C0, R13C1); R13C0 = _mm256_permute4x64_epi64(R13C0, 0x00D8); R13C0 = _mm256_shuffle_epi8(R13C0, tab_shuffle_1); R14C0 = _mm256_shuffle_epi8(R14C0, tab_shuffle); R14C1 = _mm256_shuffle_epi8(R14C1, tab_shuffle); R14C1 = _mm256_permute2x128_si256(R14C1, R14C1, 0x0003); R14C1 = _mm256_shuffle_epi8(R14C1, tab_shuffle_2); R14_ODD = _mm256_sub_epi16(R14C0, R14C1); R14C0 = _mm256_add_epi16(R14C0, R14C1); R14C0 = _mm256_permute4x64_epi64(R14C0, 0x00D8); R14C0 = _mm256_shuffle_epi8(R14C0, tab_shuffle_1); R15C0 = _mm256_shuffle_epi8(R15C0, tab_shuffle); R15C1 = _mm256_shuffle_epi8(R15C1, tab_shuffle); R15C1 = _mm256_permute2x128_si256(R15C1, R15C1, 0x0003); R15C1 = _mm256_shuffle_epi8(R15C1, tab_shuffle_2); R15_ODD = _mm256_sub_epi16(R15C0, R15C1); R15C0 = _mm256_add_epi16(R15C0, R15C1); R15C0 = _mm256_permute4x64_epi64(R15C0, 0x00D8); R15C0 = _mm256_shuffle_epi8(R15C0, tab_shuffle_1); R0R1 = _mm256_hadd_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 
103 107 100 004 003 007 000] R2R3 = _mm256_hadd_epi16(R2C0, R3C0); R4R5 = _mm256_hadd_epi16(R4C0, R5C0); R6R7 = _mm256_hadd_epi16(R6C0, R7C0); R8R9 = _mm256_hadd_epi16(R8C0, R9C0);//[905 902 906 901 805 802 806 801 / 904 903 907 900 804 803 807 800] R10R11 = _mm256_hadd_epi16(R10C0, R11C0); R12R13 = _mm256_hadd_epi16(R12C0, R13C0); R14R15 = _mm256_hadd_epi16(R14C0, R15C0); // mul the coefficient //0th row ,1th row [105+102 106+101 005+002 006+001 / 104+103 107+100 004+003 007+000] tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0]); A0C0 = _mm256_madd_epi16(R0R1, tab_t); A1C0 = _mm256_madd_epi16(R2R3, tab_t);// 2 3 A2C0 = _mm256_madd_epi16(R4R5, tab_t);// 4 5 A3C0 = _mm256_madd_epi16(R6R7, tab_t);// 6 7 A4C0 = _mm256_madd_epi16(R8R9, tab_t);// 8 9 A5C0 = _mm256_madd_epi16(R10R11, tab_t);//10 11 A6C0 = _mm256_madd_epi16(R12R13, tab_t);//12 13 A7C0 = _mm256_madd_epi16(R14R15, tab_t);//14 15 A0C0 = _mm256_hadd_epi32(A0C0, A1C0); //[3B 2B 1B 0B(05+02+06+01) / 3A 2A 1A 0A(04+03+07+00)] A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); //[3A 2A 1A 0A / 3B 2B 1B 0B] A2C0 = _mm256_hadd_epi32(A2C0, A3C0); //[7B 6B 5B 4B / 7A 6A 5A 4A] A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001);//[7A 6A 5A 4A / 7B 6B 5B 4B] A4C0 = _mm256_hadd_epi32(A4C0, A5C0); //[11B 10B 9B 8B / 11A 10A 9A 8A] A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 0x0001);//[11A 10A 9A 8A / 11B 10B 9B 8B] A6C0 = _mm256_hadd_epi32(A6C0, A7C0); //[15B 14B 13B 12B / 15A 14A 13A 12A] A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001);//[15A 14A 13A 12A / 15B 14B 13B 12B] COE0 = _mm256_add_epi32(A0C0, A1C0); //the same line`s data add to low 128 bit COE1 = _mm256_add_epi32(A2C0, A3C0); COE2 = _mm256_add_epi32(A4C0, A5C0); COE3 = _mm256_add_epi32(A6C0, A7C0); //low 128 bit is 0 1 2 3 rows data ,the high 128 bit is 8 9 10 11 rows data COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0);//[11 10 9 8 3 2 1 0] COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0);//[15 14 13 12 7 6 5 4] COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); COE_RESULT = _mm256_packs_epi32(COE0, COE1);//[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] im[0][i] = COE_RESULT; #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0R1, tab_t); \ A1C0 = _mm256_madd_epi16(R2R3, tab_t); \ A2C0 = _mm256_madd_epi16(R4R5, tab_t); \ A3C0 = _mm256_madd_epi16(R6R7, tab_t); \ A4C0 = _mm256_madd_epi16(R8R9, tab_t); \ A5C0 = _mm256_madd_epi16(R10R11, tab_t); \ A6C0 = _mm256_madd_epi16(R12R13, tab_t); \ A7C0 = _mm256_madd_epi16(R14R15, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001); \ \ A4C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 0x0001); \ \ A6C0 = _mm256_hadd_epi32(A6C0, A7C0); \ A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); \ COE1 = _mm256_add_epi32(A2C0, A3C0); \ COE2 = _mm256_add_epi32(A4C0, A5C0); \ COE3 = _mm256_add_epi32(A6C0, A7C0); \ \ COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0); \ COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(1, 8); #undef MAKE_ODD #define 
MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0R1, tab_t); \ A1C0 = _mm256_madd_epi16(R2R3, tab_t); \ A2C0 = _mm256_madd_epi16(R4R5, tab_t); \ A3C0 = _mm256_madd_epi16(R6R7, tab_t); \ A4C0 = _mm256_madd_epi16(R8R9, tab_t); \ A5C0 = _mm256_madd_epi16(R10R11, tab_t); \ A6C0 = _mm256_madd_epi16(R12R13, tab_t); \ A7C0 = _mm256_madd_epi16(R14R15, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001); \ \ A4C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A5C0 = _mm256_permute2f128_si256(A4C0, A4C0, 0x0001); \ \ A6C0 = _mm256_hadd_epi32(A6C0, A7C0); \ A7C0 = _mm256_permute2f128_si256(A6C0, A6C0, 0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); \ COE1 = _mm256_add_epi32(A2C0, A3C0); \ COE2 = _mm256_add_epi32(A4C0, A5C0); \ COE3 = _mm256_add_epi32(A6C0, A7C0); \ \ COE0 = _mm256_blend_epi32(COE0, COE2, 0x00F0); \ COE1 = _mm256_blend_epi32(COE1, COE3, 0x00F0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(3, 4); MAKE_ODD(4, 12); R0R1 = _mm256_hsub_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 103 107 100 004 003 007 000] R2R3 = _mm256_hsub_epi16(R2C0, R3C0); R4R5 = _mm256_hsub_epi16(R4C0, R5C0); R6R7 = _mm256_hsub_epi16(R6C0, R7C0); R8R9 = _mm256_hsub_epi16(R8C0, R9C0);//[905 902 906 901 805 802 806 801 / 904 903 907 900 804 803 807 800] R10R11 = _mm256_hsub_epi16(R10C0, R11C0); R12R13 = _mm256_hsub_epi16(R12C0, R13C0); R14R15 = _mm256_hsub_epi16(R14C0, R15C0); MAKE_ODD(7, 2); MAKE_ODD(8, 6); MAKE_ODD(9, 10); MAKE_ODD(10, 14); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab]); \ A0C0 = _mm256_madd_epi16(R0_ODD, tab_t); \ A0C1 = _mm256_madd_epi16(R1_ODD, tab_t); \ A1C0 = _mm256_madd_epi16(R2_ODD, tab_t); \ A1C1 = _mm256_madd_epi16(R3_ODD, tab_t); \ A2C0 = _mm256_madd_epi16(R4_ODD, tab_t); \ A2C1 = _mm256_madd_epi16(R5_ODD, tab_t); \ A3C0 = _mm256_madd_epi16(R6_ODD, tab_t); \ A3C1 = _mm256_madd_epi16(R7_ODD, tab_t); \ A4C0 = _mm256_madd_epi16(R8_ODD, tab_t); \ A4C1 = _mm256_madd_epi16(R9_ODD, tab_t); \ A5C0 = _mm256_madd_epi16(R10_ODD, tab_t); \ A5C1 = _mm256_madd_epi16(R11_ODD, tab_t); \ A6C0 = _mm256_madd_epi16(R12_ODD, tab_t); \ A6C1 = _mm256_madd_epi16(R13_ODD, tab_t); \ A7C0 = _mm256_madd_epi16(R14_ODD, tab_t); \ A7C1 = _mm256_madd_epi16(R15_ODD, tab_t); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A0C1); \ A1C0 = _mm256_hadd_epi32(A1C0, A1C1); \ A2C0 = _mm256_hadd_epi32(A2C0, A2C1); \ A3C0 = _mm256_hadd_epi32(A3C0, A3C1); \ A4C0 = _mm256_hadd_epi32(A4C0, A4C1); \ A5C0 = _mm256_hadd_epi32(A5C0, A5C1); \ A6C0 = _mm256_hadd_epi32(A6C0, A6C1); \ A7C0 = _mm256_hadd_epi32(A7C0, A7C1); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A2C0 = _mm256_hadd_epi32(A4C0, A5C0); \ A3C0 = _mm256_hadd_epi32(A6C0, A7C0); \ \ A0C1 = _mm256_permute2f128_si256(A0C0, A2C0, 0x0020); \ A1C1 = _mm256_permute2f128_si256(A0C0, A2C0, 0x0031); \ A2C1 = _mm256_permute2f128_si256(A1C0, A3C0, 0x0020); \ A3C1 = _mm256_permute2f128_si256(A1C0, A3C0, 0x0031); \ \ COE0 = _mm256_add_epi32(A0C1, A1C1); \ COE1 = _mm256_add_epi32(A2C1, A3C1); \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, 
c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos][i] = COE_RESULT; MAKE_ODD(15, 1); MAKE_ODD(16, 3); MAKE_ODD(17, 5); MAKE_ODD(18, 7); MAKE_ODD(19, 9); MAKE_ODD(20, 11); MAKE_ODD(21, 13); MAKE_ODD(22, 15); #undef MAKE_ODD } /* clear result buffer */ xavs2_memzero_aligned_c_avx(dst, 32 * 32 * sizeof(coeff_t)); __m128i mask = _mm_set1_epi16(0xffff); // DCT2: only the first 16 rows and 16 columns are computed (half transform) for (i = 0; i < 16 / 8; i++) { R0C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 0), mask)); R0C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 8), mask)); R1C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 16), mask)); R1C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 0) * i_src) + 24), mask)); R2C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 0), mask)); R2C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 8), mask)); R3C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 16), mask)); R3C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 1) * i_src) + 24), mask)); R4C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 0), mask)); R4C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 8), mask)); R5C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 16), mask)); R5C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 2) * i_src) + 24), mask)); R6C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 0), mask)); R6C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 8), mask)); R7C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 16), mask)); R7C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 3) * i_src) + 24), mask)); R8C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 0), mask)); R8C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 8), mask)); R9C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 16), mask)); R9C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 4) * i_src) + 24), mask)); R10C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 0), mask)); R10C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 8), mask)); R11C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 16), mask)); R11C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 5) * i_src) + 24), mask)); R12C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 0), mask)); R12C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 8), mask)); R13C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 6) * i_src) + 16), mask)); R13C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 
8 + 6) * i_src) + 24), mask)); R14C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 0), mask)); R14C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 8), mask)); R15C0 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 16), mask)); R15C1 = _mm256_cvtepi16_epi32(_mm_maskload_epi32((int const*)((int16_t*)(im)+((i * 8 + 7) * i_src) + 24), mask)); // inverse _m256i per 32 bit __m256i tab_inv = _mm256_setr_epi32(0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001, 0x0000); R0C1 = _mm256_permutevar8x32_epi32(R0C1, tab_inv); //[8 9 10 11 / 12 13 14 15] R1C1 = _mm256_permutevar8x32_epi32(R1C1, tab_inv); //[24 25 26 27 / 28 29 30 31] R0_ODD = _mm256_sub_epi32(R0C0, R1C1); //[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] R15_ODD = _mm256_sub_epi32(R0C1, R1C0); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] R0C0 = _mm256_add_epi32(R0C0, R1C1); //[7 6 5 4 / 3 2 1 0] R0C1 = _mm256_add_epi32(R0C1, R1C0); //[8 9 10 11 / 12 13 14 15] A0C0 = _mm256_add_epi32(R0C0, R0C1); //[7 6 5 4 / 3 2 1 0] R2C1 = _mm256_permutevar8x32_epi32(R2C1, tab_inv); R3C1 = _mm256_permutevar8x32_epi32(R3C1, tab_inv); R1_ODD = _mm256_sub_epi32(R2C0, R3C1); R14_ODD = _mm256_sub_epi32(R2C1, R3C0); R1C0 = _mm256_add_epi32(R2C0, R3C1); R1C1 = _mm256_add_epi32(R2C1, R3C0); A1C0 = _mm256_add_epi32(R1C0, R1C1); R4C1 = _mm256_permutevar8x32_epi32(R4C1, tab_inv); R5C1 = _mm256_permutevar8x32_epi32(R5C1, tab_inv); R2_ODD = _mm256_sub_epi32(R4C0, R5C1); R13_ODD = _mm256_sub_epi32(R4C1, R5C0); R2C0 = _mm256_add_epi32(R4C0, R5C1); R2C1 = _mm256_add_epi32(R4C1, R5C0); A2C0 = _mm256_add_epi32(R2C0, R2C1); R6C1 = _mm256_permutevar8x32_epi32(R6C1, tab_inv); R7C1 = _mm256_permutevar8x32_epi32(R7C1, tab_inv); R3_ODD = _mm256_sub_epi32(R6C0, R7C1); R12_ODD = _mm256_sub_epi32(R6C1, R7C0); R3C0 = _mm256_add_epi32(R6C0, R7C1); R3C1 = _mm256_add_epi32(R6C1, R7C0); A3C0 = _mm256_add_epi32(R3C0, R3C1); R8C1 = _mm256_permutevar8x32_epi32(R8C1, tab_inv); R9C1 = _mm256_permutevar8x32_epi32(R9C1, tab_inv); R4_ODD = _mm256_sub_epi32(R8C0, R9C1); R11_ODD = _mm256_sub_epi32(R8C1, R9C0); R4C0 = _mm256_add_epi32(R8C0, R9C1); R4C1 = _mm256_add_epi32(R8C1, R9C0); A4C0 = _mm256_add_epi32(R4C0, R4C1); R10C1 = _mm256_permutevar8x32_epi32(R10C1, tab_inv); R11C1 = _mm256_permutevar8x32_epi32(R11C1, tab_inv); R5_ODD = _mm256_sub_epi32(R10C0, R11C1); R10_ODD = _mm256_sub_epi32(R10C1, R11C0); R5C0 = _mm256_add_epi32(R10C0, R11C1); R5C1 = _mm256_add_epi32(R10C1, R11C0); A5C0 = _mm256_add_epi32(R5C0, R5C1); R12C1 = _mm256_permutevar8x32_epi32(R12C1, tab_inv); R13C1 = _mm256_permutevar8x32_epi32(R13C1, tab_inv); R6_ODD = _mm256_sub_epi32(R12C0, R13C1); R9_ODD = _mm256_sub_epi32(R12C1, R13C0); R6C0 = _mm256_add_epi32(R12C0, R13C1); R6C1 = _mm256_add_epi32(R12C1, R13C0); A6C0 = _mm256_add_epi32(R6C0, R6C1); R14C1 = _mm256_permutevar8x32_epi32(R14C1, tab_inv); R15C1 = _mm256_permutevar8x32_epi32(R15C1, tab_inv); R7_ODD = _mm256_sub_epi32(R14C0, R15C1);//[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] R8_ODD = _mm256_sub_epi32(R14C1, R15C0); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] R7C0 = _mm256_add_epi32(R14C0, R15C1); //[7 6 5 4 / 3 2 1 0] R7C1 = _mm256_add_epi32(R14C1, R15C0); //[8 9 10 11 / 12 13 14 15] A7C0 = _mm256_add_epi32(R7C0, R7C1); //[7 6 5 4 / 3 2 1 0] __m256i result_mask = _mm256_setr_epi32(0xf0000000, 0xf0000000, 0xf0000000, 0xf0000000, 0, 0, 0, 0); #define MAKE_ODD(tab,dstPos) \ tab_t = 
_mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab]); \ A0C1 = _mm256_mullo_epi32(A0C0, tab_t); \ A1C1 = _mm256_mullo_epi32(A1C0, tab_t); \ A2C1 = _mm256_mullo_epi32(A2C0, tab_t); \ A3C1 = _mm256_mullo_epi32(A3C0, tab_t); \ A4C1 = _mm256_mullo_epi32(A4C0, tab_t); \ A5C1 = _mm256_mullo_epi32(A5C0, tab_t); \ A6C1 = _mm256_mullo_epi32(A6C0, tab_t); \ A7C1 = _mm256_mullo_epi32(A7C0, tab_t); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); /* [107+106 105+104 007+006 005+004 / 103+102 101+100 003+002 001+000] */\ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); /* [3A 2A 1A 0A / 3B 2B 1B 0B] */\ COE1 = _mm256_hadd_epi32(COE2, COE3); /* [7A 6A 5A 4A / 7B 6B 5B 4B] */\ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); /*[7B 6B 5B 4B / 3B 2B 1B 0B]*/\ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); /*[7A 6A 5A 4A / 3A 2A 1A 0A]*/\ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); /* [7 6 5 4 / 3 2 1 0] */\ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, c_add2), SHIFT2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001);/* [3 2 1 0 / 7 6 5 4] */ \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); /*[3 2 1 0 7 6 5 4 / 7 6 5 4 3 2 1 0]*/\ addr = (dst + (dstPos * 32) + (i * 8)); \ \ _mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); //_mm256_storeu2_m128i(addr, &COE3, COE_RESULT); /*_mm256_storeu_si256(addr, COE_RESULT);*/ MAKE_ODD(0, 0); MAKE_ODD(1, 8); MAKE_ODD(4, 4); MAKE_ODD(5, 12); A0C0 = _mm256_sub_epi32(R0C0, R0C1); A1C0 = _mm256_sub_epi32(R1C0, R1C1); A2C0 = _mm256_sub_epi32(R2C0, R2C1); A3C0 = _mm256_sub_epi32(R3C0, R3C1); A4C0 = _mm256_sub_epi32(R4C0, R4C1); A5C0 = _mm256_sub_epi32(R5C0, R5C1); A6C0 = _mm256_sub_epi32(R6C0, R6C1); A7C0 = _mm256_sub_epi32(R7C0, R7C1); MAKE_ODD(8, 2); MAKE_ODD(9, 6); MAKE_ODD(10, 10); MAKE_ODD(11, 14); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ tab_t = _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab]); \ tab_t1 = _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]); \ A0C1 = _mm256_add_epi32(_mm256_mullo_epi32(R0_ODD, tab_t), _mm256_mullo_epi32(R15_ODD, tab_t1)); \ A1C1 = _mm256_add_epi32(_mm256_mullo_epi32(R1_ODD, tab_t), _mm256_mullo_epi32(R14_ODD, tab_t1)); \ A2C1 = _mm256_add_epi32(_mm256_mullo_epi32(R2_ODD, tab_t), _mm256_mullo_epi32(R13_ODD, tab_t1)); \ A3C1 = _mm256_add_epi32(_mm256_mullo_epi32(R3_ODD, tab_t), _mm256_mullo_epi32(R12_ODD, tab_t1)); \ A4C1 = _mm256_add_epi32(_mm256_mullo_epi32(R4_ODD, tab_t), _mm256_mullo_epi32(R11_ODD, tab_t1)); \ A5C1 = _mm256_add_epi32(_mm256_mullo_epi32(R5_ODD, tab_t), _mm256_mullo_epi32(R10_ODD, tab_t1)); \ A6C1 = _mm256_add_epi32(_mm256_mullo_epi32(R6_ODD, tab_t), _mm256_mullo_epi32(R9_ODD, tab_t1)); \ A7C1 = _mm256_add_epi32(_mm256_mullo_epi32(R7_ODD, tab_t), _mm256_mullo_epi32(R8_ODD, tab_t1)); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); \ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); \ COE1 = _mm256_hadd_epi32(COE2, COE3); \ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); \ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); \ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); \ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, c_add2), SHIFT2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001); \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); \ addr = (dst + (dstPos * 32) + (i * 8)); \ \ 
_mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); //_mm256_storeu2_m128i(addr, &COE3, COE_RESULT); //_mm256_storeu_si256(addr, COE_RESULT); MAKE_ODD(16, 1); MAKE_ODD(18, 3); MAKE_ODD(20, 5); MAKE_ODD(22, 7); MAKE_ODD(24, 9); MAKE_ODD(26, 11); MAKE_ODD(28, 13); MAKE_ODD(30, 15); #undef MAKE_ODD } } /* --------------------------------------------------------------------------- */ void dct_c_8x32_avx2(const coeff_t *src, coeff_t *dst, int i_src) { __m256i line00, line10, line20, line30, line40, line50, line60, line70; __m256i line01, line11, line21, line31, line41, line51, line61, line71; __m256i line02, line12, line22, line32, line42, line52, line62, line72; __m256i line03, line13, line23, line33, line43, line53, line63, line73; __m256i e0, e1; __m256i o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15; __m256i ee0, eo0; __m256i add1, add2; __m256i im[32][4]; ALIGN32(static const int16_t shuffle[]) = { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, }; __m256i A0C0, A0C1, A1C0, A1C1, A2C0, A2C1, A3C0, A3C1, A4C0, A4C1, A5C0, A5C1, A6C0, A6C1, A7C0, A7C1; __m256i COE0, COE1, COE2, COE3; __m256i COE_RESULT; int shift1, shift2; int i; shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT - 2 + (i_src & 0x01); shift2 = B32X32_IN_BIT + FACTO_BIT; const int ADD11 = (1 << shift1) >> 1; const int ADD21 = (1 << shift2) >> 1; add1 = _mm256_set1_epi32(ADD11); // TODO: shift1 = 2 add2 = _mm256_set1_epi32(ADD21); i_src &= 0xFE; /* remember to remove the flag bit */ // dct1 for (i = 0; i < 32 / 8; i++) { line00 = _mm256_loadu2_m128i((__m128i *)(src + (i * 8 + 4) * i_src), (__m128i *)(src + (i * 8 + 0) * i_src)); // [00~07 40~47] line10 = _mm256_loadu2_m128i((__m128i *)(src + (i * 8 + 5) * i_src), (__m128i *)(src + (i * 8 + 1) * i_src)); // [10~17 50~57] line20 = _mm256_loadu2_m128i((__m128i *)(src + (i * 8 + 6) * i_src), (__m128i *)(src + (i * 8 + 2) * i_src)); // [20~27 60~67] line30 = _mm256_loadu2_m128i((__m128i *)(src + (i * 8 + 7) * i_src), (__m128i *)(src + (i * 8 + 3) * i_src)); // [30~37 70~77] line00 = _mm256_shuffle_epi8(line00, _mm256_load_si256((const __m256i *)shuffle)); // [00 07 03 04 01 06 02 05 40 47 43 44 41 46 42 45] line10 = _mm256_shuffle_epi8(line10, _mm256_load_si256((const __m256i *)shuffle)); line20 = _mm256_shuffle_epi8(line20, _mm256_load_si256((const __m256i *)shuffle)); line30 = _mm256_shuffle_epi8(line30, _mm256_load_si256((const __m256i *)shuffle)); e0 = _mm256_hadd_epi16(line00, line10); // [e00 e03 e01 e02 e10 e13 e11 e12 e40 e43 e41 e42 e50 e53 e51 e52] e1 = _mm256_hadd_epi16(line20, line30); o0 = _mm256_hsub_epi16(line00, line10); // [o00 o03 o01 o02 o10 o13 o11 o12 o40 o43 o41 o42 o50 o53 o51 o52] o1 = _mm256_hsub_epi16(line20, line30); ee0 = _mm256_hadd_epi16(e0, e1); // [ee00 ee01 ee10 ee11 ee20 ee21 ee30 ee31 ee40 ee41 ee50 ee51 ee60 ee61 ee70 ee71] eo0 = _mm256_hsub_epi16(e0, e1); line00 = _mm256_madd_epi16(ee0, _mm256_load_si256((const __m256i *)tab_dct_8x32_avx2[0])); line40 = _mm256_madd_epi16(ee0, _mm256_load_si256((const __m256i *)tab_dct_8x32_avx2[4])); line20 = _mm256_madd_epi16(eo0, _mm256_load_si256((const __m256i *)tab_dct_8x32_avx2[2])); line60 = _mm256_madd_epi16(eo0, _mm256_load_si256((const __m256i *)tab_dct_8x32_avx2[6])); #define CALC_DATA(line, tab) \ line = _mm256_hadd_epi32(\ _mm256_madd_epi16(o0, _mm256_load_si256((const __m256i *)tab_dct_8x32_avx2[tab])), \ _mm256_madd_epi16(o1, _mm256_load_si256((const __m256i 
*)tab_dct_8x32_avx2[tab])) \ ); CALC_DATA(line10, 1); CALC_DATA(line30, 3); CALC_DATA(line50, 5); CALC_DATA(line70, 7); #undef CALC_DATA _mm256_storeu_si256(&im[0][i], _mm256_srai_epi32(_mm256_add_epi32(line00, add1), shift1)); _mm256_storeu_si256(&im[1][i], _mm256_srai_epi32(_mm256_add_epi32(line10, add1), shift1)); _mm256_storeu_si256(&im[2][i], _mm256_srai_epi32(_mm256_add_epi32(line20, add1), shift1)); _mm256_storeu_si256(&im[3][i], _mm256_srai_epi32(_mm256_add_epi32(line30, add1), shift1)); _mm256_storeu_si256(&im[4][i], _mm256_srai_epi32(_mm256_add_epi32(line40, add1), shift1)); _mm256_storeu_si256(&im[5][i], _mm256_srai_epi32(_mm256_add_epi32(line50, add1), shift1)); _mm256_storeu_si256(&im[6][i], _mm256_srai_epi32(_mm256_add_epi32(line60, add1), shift1)); _mm256_storeu_si256(&im[7][i], _mm256_srai_epi32(_mm256_add_epi32(line70, add1), shift1)); } //DCT2 #define load_one_line(x) \ line##x##0 = _mm256_load_si256(&im[x][0]); \ line##x##1 = _mm256_load_si256(&im[x][1]); \ line##x##2 = _mm256_load_si256(&im[x][2]); \ line##x##3 = _mm256_load_si256(&im[x][3]) \ load_one_line(0); load_one_line(1); load_one_line(2); load_one_line(3); load_one_line(4); load_one_line(5); load_one_line(6); load_one_line(7); #undef load_one_line //inverse _m256i per 32 bit __m256i tab_inv = _mm256_setr_epi32(0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001, 0x0000); line01 = _mm256_permutevar8x32_epi32(line01, tab_inv); //[8 9 10 11 / 12 13 14 15] line03 = _mm256_permutevar8x32_epi32(line03, tab_inv); //[24 25 26 27 / 28 29 30 31] o0 = _mm256_sub_epi32(line00, line03); //[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] o15 = _mm256_sub_epi32(line01, line02); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] line00 = _mm256_add_epi32(line00, line03); //[7 6 5 4 / 3 2 1 0] line01 = _mm256_add_epi32(line01, line02); //[8 9 10 11 / 12 13 14 15] A0C0 = _mm256_add_epi32(line00, line01); //[7 6 5 4 / 3 2 1 0] line11 = _mm256_permutevar8x32_epi32(line11, tab_inv); line13 = _mm256_permutevar8x32_epi32(line13, tab_inv); o1 = _mm256_sub_epi32(line10, line13); o14 = _mm256_sub_epi32(line11, line12); line02 = _mm256_add_epi32(line10, line13); line03 = _mm256_add_epi32(line11, line12); A1C0 = _mm256_add_epi32(line02, line03); line21 = _mm256_permutevar8x32_epi32(line21, tab_inv); line23 = _mm256_permutevar8x32_epi32(line23, tab_inv); o2 = _mm256_sub_epi32(line20, line23); o13 = _mm256_sub_epi32(line21, line22); line10 = _mm256_add_epi32(line20, line23); line11 = _mm256_add_epi32(line21, line22); A2C0 = _mm256_add_epi32(line10, line11); line31 = _mm256_permutevar8x32_epi32(line31, tab_inv); line33 = _mm256_permutevar8x32_epi32(line33, tab_inv); o3 = _mm256_sub_epi32(line30, line33); o12 = _mm256_sub_epi32(line31, line32); line12 = _mm256_add_epi32(line30, line33); line13 = _mm256_add_epi32(line31, line32); A3C0 = _mm256_add_epi32(line12, line13); line41 = _mm256_permutevar8x32_epi32(line41, tab_inv); line43 = _mm256_permutevar8x32_epi32(line43, tab_inv); o4 = _mm256_sub_epi32(line40, line43); o11 = _mm256_sub_epi32(line41, line42); line20 = _mm256_add_epi32(line40, line43); line21 = _mm256_add_epi32(line41, line42); A4C0 = _mm256_add_epi32(line20, line21); line51 = _mm256_permutevar8x32_epi32(line51, tab_inv); line53 = _mm256_permutevar8x32_epi32(line53, tab_inv); o5 = _mm256_sub_epi32(line50, line53); o10 = _mm256_sub_epi32(line51, line52); line22 = _mm256_add_epi32(line50, line53); line23 = _mm256_add_epi32(line51, line52); A5C0 = _mm256_add_epi32(line22, line23); line61 = _mm256_permutevar8x32_epi32(line61, tab_inv); 
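/* Rows 6 and 7 repeat the butterfly applied to rows 0-5 above: tab_inv
   reverses the 2nd and 4th group of eight 32-bit coefficients of the row,
   the o* registers keep the mirror differences x[k] - x[31-k] (odd part) and
   A*C0 the mirror sums (even part) that feed the tab_dct2_32x32_avx2[]
   coefficient multiplies further down. */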
line63 = _mm256_permutevar8x32_epi32(line63, tab_inv); o6 = _mm256_sub_epi32(line60, line63); o9 = _mm256_sub_epi32(line61, line62); line30 = _mm256_add_epi32(line60, line63); line31 = _mm256_add_epi32(line61, line62); A6C0 = _mm256_add_epi32(line30, line31); line71 = _mm256_permutevar8x32_epi32(line71, tab_inv); line73 = _mm256_permutevar8x32_epi32(line73, tab_inv); o7 = _mm256_sub_epi32(line70, line73);//[7-24 6-25 5-26 4-27 / 3-28 2-29 1-30 0-31] o8 = _mm256_sub_epi32(line71, line72); //[8-23 9-22 10-21 11-20 / 12-19 13-18 14-17 15-16] line32 = _mm256_add_epi32(line70, line73); //[7 6 5 4 / 3 2 1 0] line33 = _mm256_add_epi32(line71, line72); //[8 9 10 11 / 12 13 14 15] A7C0 = _mm256_add_epi32(line32, line33); //[7 6 5 4 / 3 2 1 0] __m256i result_mask = _mm256_setr_epi32(0xf0000000, 0xf0000000, 0xf0000000, 0xf0000000, 0, 0, 0, 0); #define MAKE_ODD(tab,dstPos)\ A0C1 = _mm256_mullo_epi32(A0C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A1C1 = _mm256_mullo_epi32(A1C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A2C1 = _mm256_mullo_epi32(A2C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A3C1 = _mm256_mullo_epi32(A3C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A4C1 = _mm256_mullo_epi32(A4C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A5C1 = _mm256_mullo_epi32(A5C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A6C1 = _mm256_mullo_epi32(A6C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ A7C1 = _mm256_mullo_epi32(A7C0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); /* [107+106 105+104 007+006 005+004 / 103+102 101+100 003+002 001+000] */\ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); /* [3A 2A 1A 0A / 3B 2B 1B 0B] */\ COE1 = _mm256_hadd_epi32(COE2, COE3); /* [7A 6A 5A 4A / 7B 6B 5B 4B] */\ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); /*[7B 6B 5B 4B / 3B 2B 1B 0B]*/\ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); /*[7A 6A 5A 4A / 3A 2A 1A 0A]*/\ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); /* [7 6 5 4 / 3 2 1 0] */\ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, add2), shift2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001);/* [3 2 1 0 / 7 6 5 4] */ \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); /*[3 2 1 0 7 6 5 4 / 7 6 5 4 3 2 1 0]*/\ _mm256_maskstore_epi32((int *)(dst + dstPos * 8), result_mask, COE_RESULT); MAKE_ODD(0, 0); MAKE_ODD(1, 8); MAKE_ODD(2, 16); MAKE_ODD(3, 24); MAKE_ODD(4, 4); MAKE_ODD(5, 12); MAKE_ODD(6, 20); MAKE_ODD(7, 28); A0C0 = _mm256_sub_epi32(line00, line01); A1C0 = _mm256_sub_epi32(line02, line03); A2C0 = _mm256_sub_epi32(line10, line11); A3C0 = _mm256_sub_epi32(line12, line13); A4C0 = _mm256_sub_epi32(line20, line21); A5C0 = _mm256_sub_epi32(line22, line23); A6C0 = _mm256_sub_epi32(line30, line31); A7C0 = _mm256_sub_epi32(line32, line33); MAKE_ODD(8, 2); MAKE_ODD(9, 6); MAKE_ODD(10, 10); MAKE_ODD(11, 14); MAKE_ODD(12, 18); MAKE_ODD(13, 22); MAKE_ODD(14, 26); MAKE_ODD(15, 30); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ A0C1 = _mm256_add_epi32(_mm256_mullo_epi32(o0, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o15, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A1C1 = _mm256_add_epi32(_mm256_mullo_epi32(o1, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o14, 
_mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A2C1 = _mm256_add_epi32(_mm256_mullo_epi32(o2, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o13, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A3C1 = _mm256_add_epi32(_mm256_mullo_epi32(o3, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o12, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A4C1 = _mm256_add_epi32(_mm256_mullo_epi32(o4, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o11, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A5C1 = _mm256_add_epi32(_mm256_mullo_epi32(o5, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o10, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A6C1 = _mm256_add_epi32(_mm256_mullo_epi32(o6, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o9, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ A7C1 = _mm256_add_epi32(_mm256_mullo_epi32(o7, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab])), _mm256_mullo_epi32(o8, _mm256_load_si256((__m256i*)tab_dct2_32x32_avx2[tab + 1]))); \ \ COE0 = _mm256_hadd_epi32(A0C1, A1C1); \ COE1 = _mm256_hadd_epi32(A2C1, A3C1); \ COE2 = _mm256_hadd_epi32(A4C1, A5C1); \ COE3 = _mm256_hadd_epi32(A6C1, A7C1); \ \ COE0 = _mm256_hadd_epi32(COE0, COE1); \ COE1 = _mm256_hadd_epi32(COE2, COE3); \ \ COE2 = _mm256_permute2f128_si256(COE0, COE1, 0x0020); \ COE3 = _mm256_permute2f128_si256(COE0, COE1, 0x0031); \ \ COE_RESULT = _mm256_add_epi32(COE2, COE3); \ COE_RESULT = _mm256_srai_epi32(_mm256_add_epi32(COE_RESULT, add2), shift2); \ COE0 = _mm256_permute2f128_si256(COE_RESULT, COE_RESULT, 0x0001); \ COE_RESULT = _mm256_packs_epi32(COE_RESULT, COE0); \ _mm256_maskstore_epi32((int *)(dst + dstPos * 8), result_mask, COE_RESULT); MAKE_ODD(16, 1); MAKE_ODD(18, 3); MAKE_ODD(20, 5); MAKE_ODD(22, 7); MAKE_ODD(24, 9); MAKE_ODD(26, 11); MAKE_ODD(28, 13); MAKE_ODD(30, 15); MAKE_ODD(32, 17); MAKE_ODD(34, 19); MAKE_ODD(36, 21); MAKE_ODD(38, 23); MAKE_ODD(40, 25); MAKE_ODD(42, 27); MAKE_ODD(44, 29); MAKE_ODD(46, 31); #undef MAKE_ODD } /* --------------------------------------------------------------------------- */ void dct_c_32x8_avx2(const coeff_t *src, coeff_t *dst, int i_src) { //const int shift1 = SHIFT1 + (i_src & 0x01); int shift1 = B32X32_IN_BIT + FACTO_BIT + g_bit_depth + 1 - LIMIT_BIT; int shift2 = B32X32_IN_BIT + FACTO_BIT - 2 - (i_src & 0x01); const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; const __m256i c_add1 = _mm256_set1_epi32(ADD1); // TODO: shift1 = 2 const __m256i c_add2 = _mm256_set1_epi32(ADD2); //R---row C-column __m256i R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, R3C0, R3C1, R4C0, R4C1, R5C0, R5C1, R6C0, R6C1, R7C0, R7C1; //store anser __m256i A0C0, A0C1, A1C0, A1C1, A2C0, A2C1, A3C0, A3C1; __m256i R0R1, R2R3, R4R5, R6R7; __m256i COE0, COE1; __m256i COE_RESULT; __m256i im[32]; __m256i R0_ODD, R1_ODD, R2_ODD, R3_ODD, R4_ODD, R5_ODD, R6_ODD, R7_ODD; int i; coeff_t* addr; i_src &= 0xFE; R0C0 = _mm256_load_si256((__m256i*)(src + 0 * i_src + 0)); //[15 14 13 12 11 10... 03 02 01 00] R0C1 = _mm256_load_si256((__m256i*)(src + 0 * i_src + 16)); //[31 30 29 28 11 10... 
19 18 17 16] R1C0 = _mm256_load_si256((__m256i*)(src + 1 * i_src + 0)); R1C1 = _mm256_load_si256((__m256i*)(src + 1 * i_src + 16)); R2C0 = _mm256_load_si256((__m256i*)(src + 2 * i_src + 0)); R2C1 = _mm256_load_si256((__m256i*)(src + 2 * i_src + 16)); R3C0 = _mm256_load_si256((__m256i*)(src + 3 * i_src + 0)); R3C1 = _mm256_load_si256((__m256i*)(src + 3 * i_src + 16)); R4C0 = _mm256_load_si256((__m256i*)(src + 4 * i_src + 0)); R4C1 = _mm256_load_si256((__m256i*)(src + 4 * i_src + 16)); R5C0 = _mm256_load_si256((__m256i*)(src + 5 * i_src + 0)); R5C1 = _mm256_load_si256((__m256i*)(src + 5 * i_src + 16)); R6C0 = _mm256_load_si256((__m256i*)(src + 6 * i_src + 0)); R6C1 = _mm256_load_si256((__m256i*)(src + 6 * i_src + 16)); R7C0 = _mm256_load_si256((__m256i*)(src + 7 * i_src + 0)); R7C1 = _mm256_load_si256((__m256i*)(src + 7 * i_src + 16)); //notice that different set / setr low dizhi butong __m256i tab_shuffle = _mm256_setr_epi16(0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A); __m256i tab_shuffle_1 = _mm256_setr_epi16(0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C, 0x0100, 0x0B0A, 0x0302, 0x0908, 0x0504, 0x0F0E, 0x0706, 0x0D0C); __m256i tab_shuffle_2 = _mm256_setr_epi16(0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); //[13 10 14 09 12 11 15 08 05 02 06 01 04 03 07 00] //[29 26 30 25 28 27 31 24 21 18 22 17 20 19 23 16] R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle); R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle); R0C1 = _mm256_permute2x128_si256(R0C1, R0C1, 0x0003);//permute [21 18 22 17 20 19 23 16 / 29 26 30 25 28 27 31 24] R0C1 = _mm256_shuffle_epi8(R0C1, tab_shuffle_2); // [18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] //[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] //[18 21 17 22 19 20 16 23 / 26 29 25 30 27 28 24 31] R0_ODD = _mm256_sub_epi16(R0C0, R0C1); R0C0 = _mm256_add_epi16(R0C0, R0C1);//[13 10 14 09 12 11 15 08 / 05 02 06 01 04 03 07 00] R0C0 = _mm256_permute4x64_epi64(R0C0, 0x00D8);//[13 10 14 09 05 02 06 01 / 12 11 15 08 04 03 07 00] R0C0 = _mm256_shuffle_epi8(R0C0, tab_shuffle_1);//[10 05 13 02 09 06 14 01 / 11 04 12 03 08 07 15 00] R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle); R1C1 = _mm256_permute2x128_si256(R1C1, R1C1, 0x0003); R1C1 = _mm256_shuffle_epi8(R1C1, tab_shuffle_2); R1_ODD = _mm256_sub_epi16(R1C0, R1C1); R1C0 = _mm256_add_epi16(R1C0, R1C1); R1C0 = _mm256_permute4x64_epi64(R1C0, 0x00D8); R1C0 = _mm256_shuffle_epi8(R1C0, tab_shuffle_1); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle); R2C1 = _mm256_permute2x128_si256(R2C1, R2C1, 0x0003); R2C1 = _mm256_shuffle_epi8(R2C1, tab_shuffle_2); R2_ODD = _mm256_sub_epi16(R2C0, R2C1); R2C0 = _mm256_add_epi16(R2C0, R2C1); R2C0 = _mm256_permute4x64_epi64(R2C0, 0x00D8); R2C0 = _mm256_shuffle_epi8(R2C0, tab_shuffle_1); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle); R3C1 = _mm256_permute2x128_si256(R3C1, R3C1, 0x0003); R3C1 = _mm256_shuffle_epi8(R3C1, tab_shuffle_2); R3_ODD = _mm256_sub_epi16(R3C0, R3C1); R3C0 = _mm256_add_epi16(R3C0, R3C1); R3C0 = _mm256_permute4x64_epi64(R3C0, 0x00D8); R3C0 = _mm256_shuffle_epi8(R3C0, tab_shuffle_1); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle); R4C1 = _mm256_shuffle_epi8(R4C1, tab_shuffle); R4C1 = _mm256_permute2x128_si256(R4C1, R4C1, 0x0003); R4C1 = _mm256_shuffle_epi8(R4C1, 
tab_shuffle_2); R4_ODD = _mm256_sub_epi16(R4C0, R4C1); R4C0 = _mm256_add_epi16(R4C0, R4C1); R4C0 = _mm256_permute4x64_epi64(R4C0, 0x00D8); R4C0 = _mm256_shuffle_epi8(R4C0, tab_shuffle_1); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle); R5C1 = _mm256_permute2x128_si256(R5C1, R5C1, 0x0003); R5C1 = _mm256_shuffle_epi8(R5C1, tab_shuffle_2); R5_ODD = _mm256_sub_epi16(R5C0, R5C1); R5C0 = _mm256_add_epi16(R5C0, R5C1); R5C0 = _mm256_permute4x64_epi64(R5C0, 0x00D8); R5C0 = _mm256_shuffle_epi8(R5C0, tab_shuffle_1); R6C0 = _mm256_shuffle_epi8(R6C0, tab_shuffle); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle); R6C1 = _mm256_permute2x128_si256(R6C1, R6C1, 0x0003); R6C1 = _mm256_shuffle_epi8(R6C1, tab_shuffle_2); R6_ODD = _mm256_sub_epi16(R6C0, R6C1); R6C0 = _mm256_add_epi16(R6C0, R6C1); R6C0 = _mm256_permute4x64_epi64(R6C0, 0x00D8); R6C0 = _mm256_shuffle_epi8(R6C0, tab_shuffle_1); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle); R7C1 = _mm256_permute2x128_si256(R7C1, R7C1, 0x0003); R7C1 = _mm256_shuffle_epi8(R7C1, tab_shuffle_2); R7_ODD = _mm256_sub_epi16(R7C0, R7C1); R7C0 = _mm256_add_epi16(R7C0, R7C1); R7C0 = _mm256_permute4x64_epi64(R7C0, 0x00D8); R7C0 = _mm256_shuffle_epi8(R7C0, tab_shuffle_1); R0R1 = _mm256_hadd_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 103 107 100 004 003 007 000] R2R3 = _mm256_hadd_epi16(R2C0, R3C0); R4R5 = _mm256_hadd_epi16(R4C0, R5C0); R6R7 = _mm256_hadd_epi16(R6C0, R7C0); // mul the coefficient //0th row ,1th row [105+102 106+101 005+002 006+001 / 104+103 107+100 004+003 007+000] A0C0 = _mm256_madd_epi16(R0R1, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0])); A1C0 = _mm256_madd_epi16(R2R3, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0]));// 2 3 A2C0 = _mm256_madd_epi16(R4R5, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0]));// 4 5 A3C0 = _mm256_madd_epi16(R6R7, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[0]));// 6 7 A0C0 = _mm256_hadd_epi32(A0C0, A1C0); //[3B 2B 1B 0B(05+02+06+01) / 3A 2A 1A 0A(04+03+07+00)] A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); //[3A 2A 1A 0A / 3B 2B 1B 0B] A2C0 = _mm256_hadd_epi32(A2C0, A3C0); //[7B 6B 5B 4B / 7A 6A 5A 4A] A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001);//[7A 6A 5A 4A / 7B 6B 5B 4B] COE0 = _mm256_add_epi32(A0C0, A1C0); //the same line`s data add to low 128 bit (3 2 1 0) COE1 = _mm256_add_epi32(A2C0, A3C0); COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); COE_RESULT = _mm256_packs_epi32(COE0, COE1);//[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] im[0] = COE_RESULT; COE0 = _mm256_sub_epi32(A0C0, A1C0); COE1 = _mm256_sub_epi32(A2C0, A3C0); COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); COE_RESULT = _mm256_packs_epi32(COE0, COE1);//[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] im[16] = COE_RESULT; #define MAKE_ODD(tab,dstPos) \ A0C0 = _mm256_madd_epi16(R0R1, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A1C0 = _mm256_madd_epi16(R2R3, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A2C0 = _mm256_madd_epi16(R4R5, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A3C0 = _mm256_madd_epi16(R6R7, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 
0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); \ COE1 = _mm256_add_epi32(A2C0, A3C0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ im[dstPos] = COE_RESULT; MAKE_ODD(1, 8); MAKE_ODD(2, 24); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ A0C0 = _mm256_madd_epi16(R0R1, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A1C0 = _mm256_madd_epi16(R2R3, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A2C0 = _mm256_madd_epi16(R4R5, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A3C0 = _mm256_madd_epi16(R6R7, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0); \ A1C0 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001); \ \ A2C0 = _mm256_hadd_epi32(A2C0, A3C0); \ A3C0 = _mm256_permute2f128_si256(A2C0, A2C0, 0x0001); \ \ COE0 = _mm256_add_epi32(A0C0, A1C0); \ COE1 = _mm256_add_epi32(A2C0, A3C0); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ \ im[dstPos] = COE_RESULT; MAKE_ODD(3, 4); MAKE_ODD(4, 12); MAKE_ODD(5, 20); MAKE_ODD(6, 28); R0R1 = _mm256_hsub_epi16(R0C0, R1C0);//[105 102 106 101 005 002 006 001 / 104 103 107 100 004 003 007 000] R2R3 = _mm256_hsub_epi16(R2C0, R3C0); R4R5 = _mm256_hsub_epi16(R4C0, R5C0); R6R7 = _mm256_hsub_epi16(R6C0, R7C0); MAKE_ODD(7, 2); MAKE_ODD(8, 6); MAKE_ODD(9, 10); MAKE_ODD(10, 14); MAKE_ODD(11, 18); MAKE_ODD(12, 22); MAKE_ODD(13, 26); MAKE_ODD(14, 30); #undef MAKE_ODD #define MAKE_ODD(tab,dstPos) \ A0C0 = _mm256_madd_epi16(R0_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A0C1 = _mm256_madd_epi16(R1_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A1C0 = _mm256_madd_epi16(R2_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A1C1 = _mm256_madd_epi16(R3_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A2C0 = _mm256_madd_epi16(R4_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A2C1 = _mm256_madd_epi16(R5_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A3C0 = _mm256_madd_epi16(R6_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ A3C1 = _mm256_madd_epi16(R7_ODD, _mm256_load_si256((__m256i*)tab_dct_32x32_avx2[tab])); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A0C1); \ A1C0 = _mm256_hadd_epi32(A1C0, A1C1); \ A2C0 = _mm256_hadd_epi32(A2C0, A2C1); \ A3C0 = _mm256_hadd_epi32(A3C0, A3C1); \ \ A0C0 = _mm256_hadd_epi32(A0C0, A1C0)/*[3B 2B 1B 0B / 3A 2A 1A 0A]*/; \ A1C0 = _mm256_hadd_epi32(A2C0, A3C0)/*[7B 6B 5B 4B / 7A 6A 5A 4A]*/; \ \ A0C1 = _mm256_permute2f128_si256(A0C0, A0C0, 0x0001)/*[3A 2A 1A 0A / 3B 2B 1B 0B]*/; \ A1C1 = _mm256_permute2f128_si256(A1C0, A1C0, 0x0001)/*[7A 6A 5A 4A / 7B 6B 5B 4B]*/; \ \ COE0 = _mm256_add_epi32(A0C0, A0C1); \ COE1 = _mm256_add_epi32(A1C0, A1C1); \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add1), shift1); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add1), shift1); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ \ im[dstPos] = COE_RESULT; MAKE_ODD(15, 1); MAKE_ODD(16, 3); MAKE_ODD(17, 5); MAKE_ODD(18, 7); MAKE_ODD(19, 9); MAKE_ODD(20, 11); MAKE_ODD(21, 13); MAKE_ODD(22, 15); MAKE_ODD(23, 17); MAKE_ODD(24, 19); MAKE_ODD(25, 21); MAKE_ODD(26, 23); MAKE_ODD(27, 25); MAKE_ODD(28, 27); MAKE_ODD(29, 29); MAKE_ODD(30, 31); #undef MAKE_ODD __m256i table_shuffle = _mm256_setr_epi16(0x0100, 0x0F0E, 
0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0706, 0x0908, 0x0100, 0x0F0E, 0x0302, 0x0D0C, 0x0504, 0x0B0A, 0x0706, 0x0908); //__m256i im[32] for (i = 0; i < 32 / 8; i++){ R0C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 0));//[0 1 2 3 4 5 6 7 / *********] R1C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 1)); R2C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 2)); R3C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 3)); R4C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 4)); R5C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 5)); R6C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 6)); R7C0 = _mm256_load_si256((__m256i const *)im + (i * 8 + 7)); R0C0 = _mm256_shuffle_epi8(R0C0, table_shuffle); //[00 07 01 06 02 05 03 04 / *********] R1C0 = _mm256_shuffle_epi8(R1C0, table_shuffle); R2C0 = _mm256_shuffle_epi8(R2C0, table_shuffle); R3C0 = _mm256_shuffle_epi8(R3C0, table_shuffle); R4C0 = _mm256_shuffle_epi8(R4C0, table_shuffle); R5C0 = _mm256_shuffle_epi8(R5C0, table_shuffle); R6C0 = _mm256_shuffle_epi8(R6C0, table_shuffle); R7C0 = _mm256_shuffle_epi8(R7C0, table_shuffle); R0C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R0C0); //[00 07 01 06 / 02 05 03 04] R1C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R1C0); R2C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R2C0); R3C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R3C0); R4C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R4C0); R5C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R5C0); R6C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R6C0); R7C0 = _mm256_cvtepi16_epi32(*(__m128i*)&R7C0); R0R1 = _mm256_hadd_epi32(R0C0, R1C0);//[00+07 01+06 10+17 11+16 / 02+05 03+06 12+15 13+16]------------0/1 R2R3 = _mm256_hadd_epi32(R2C0, R3C0); R4R5 = _mm256_hadd_epi32(R4C0, R5C0); R6R7 = _mm256_hadd_epi32(R6C0, R7C0); __m256i result_mask = _mm256_setr_epi32(0xf0000000, 0xf0000000, 0xf0000000, 0xf0000000, 0, 0, 0, 0); #define MAKE_ODD(tab,dstPos)\ R0C1 = _mm256_mullo_epi32(R0R1, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R1C1 = _mm256_mullo_epi32(R2R3, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R2C1 = _mm256_mullo_epi32(R4R5, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R3C1 = _mm256_mullo_epi32(R6R7, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ \ R0C1 = _mm256_hadd_epi32(R0C1, R1C1/*[00+07+01+06 10+17+11+16 20+27+21+26 30+27+41+36 / 02+05+03+06 12+15+13+16 22+25+23+26 32+35+33+36]*/); \ R1C1 = _mm256_hadd_epi32(R2C1, R3C1); \ \ R2C1 = _mm256_permute2f128_si256(R0C1, R1C1, 0x0020/*[0A 1A 2A 3A / 4A 5A 6A 7A]*/); \ R3C1 = _mm256_permute2f128_si256(R0C1, R1C1, 0x0031/*[0B 1B 2B 3B / 4B 5B 6B 7B]*/); \ \ COE0 = _mm256_add_epi32(R2C1, R3C1/*[0 1 2 3 / 4 5 6 7]*/); \ COE1 = _mm256_permute2f128_si256(COE0, COE0, 0x33/*[4 5 6 7 / 4 5 6 7]*/); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add2), shift2); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add2), shift2); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ \ addr = (dst + (dstPos * 32) + (i * 8)); \ _mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); \ MAKE_ODD(0, 0); MAKE_ODD(2, 2); MAKE_ODD(4, 4); MAKE_ODD(6, 6); #undef MAKE_ODD R0R1 = _mm256_hsub_epi32(R0C0, R1C0);//[00-07 01-06 10-17 11-16 / 02-05 03-06 12-15 13-16]------------0/1 R2R3 = _mm256_hsub_epi32(R2C0, R3C0); R4R5 = _mm256_hsub_epi32(R4C0, R5C0); R6R7 = _mm256_hsub_epi32(R6C0, R7C0); #define MAKE_ODD(tab,dstPos)\ R0C1 = _mm256_mullo_epi32(R0R1, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R1C1 = _mm256_mullo_epi32(R2R3, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R2C1 = 
_mm256_mullo_epi32(R4R5, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ R3C1 = _mm256_mullo_epi32(R6R7, _mm256_load_si256((__m256i *)tab_dct_8x8_avx2[tab])); \ \ R0C1 = _mm256_hadd_epi32(R0C1, R1C1/*[00+07+01+06 10+17+11+16 20+27+21+26 30+27+41+36 / 02+05+03+06 12+15+13+16 22+25+23+26 32+35+33+36]*/); \ R1C1 = _mm256_hadd_epi32(R2C1, R3C1); \ \ R2C1 = _mm256_permute2f128_si256(R0C1, R1C1, 0x0020/*[0A 1A 2A 3A / 4A 5A 6A 7A]*/); \ R3C1 = _mm256_permute2f128_si256(R0C1, R1C1, 0x0031/*[0B 1B 2B 3B / 4B 5B 6B 7B]*/); \ \ COE0 = _mm256_add_epi32(R2C1, R3C1/*[0 1 2 3 / 4 5 6 7]*/); \ COE1 = _mm256_permute2f128_si256(COE0, COE0, 0x33/*[4 5 6 7 / 4 5 6 7]*/); \ \ COE0 = _mm256_srai_epi32(_mm256_add_epi32(COE0, c_add2), shift2); \ COE1 = _mm256_srai_epi32(_mm256_add_epi32(COE1, c_add2), shift2); \ \ COE_RESULT = _mm256_packs_epi32(COE0, COE1); \ \ addr = (dst + (dstPos * 32) + (i * 8)); \ _mm256_maskstore_epi32((int*)addr, result_mask, COE_RESULT); \ MAKE_ODD(1, 1); MAKE_ODD(3, 3); MAKE_ODD(5, 5); MAKE_ODD(7, 7); #undef MAKE_ODD } } /* --------------------------------------------------------------------------- * transpose 16x16(ת) */ #define TRANSPOSE_16x16_16BIT(A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A10, A11, A12, A13, A14, A15, H00, H01,H02, H03, H04, H05, H06, H07, H08, H09, H10, H11, H12, H13, H14, H15) \ tr0_00 = _mm256_unpacklo_epi16(A00, A01); \ tr0_01 = _mm256_unpacklo_epi16(A02, A03); \ tr0_02 = _mm256_unpackhi_epi16(A00, A01); \ tr0_03 = _mm256_unpackhi_epi16(A02, A03); \ tr0_04 = _mm256_unpacklo_epi16(A04, A05); \ tr0_05 = _mm256_unpacklo_epi16(A06, A07); \ tr0_06 = _mm256_unpackhi_epi16(A04, A05); \ tr0_07 = _mm256_unpackhi_epi16(A06, A07); \ tr0_08 = _mm256_unpacklo_epi16(A08, A09); \ tr0_09 = _mm256_unpacklo_epi16(A10, A11); \ tr0_10 = _mm256_unpackhi_epi16(A08, A09); \ tr0_11 = _mm256_unpackhi_epi16(A10, A11); \ tr0_12 = _mm256_unpacklo_epi16(A12, A13); \ tr0_13 = _mm256_unpacklo_epi16(A14, A15); \ tr0_14 = _mm256_unpackhi_epi16(A12, A13); \ tr0_15 = _mm256_unpackhi_epi16(A14, A15); \ tr1_00 = _mm256_unpacklo_epi32(tr0_00, tr0_01); \ tr1_01 = _mm256_unpacklo_epi32(tr0_02, tr0_03); \ tr1_02 = _mm256_unpackhi_epi32(tr0_00, tr0_01); \ tr1_03 = _mm256_unpackhi_epi32(tr0_02, tr0_03); \ tr1_04 = _mm256_unpacklo_epi32(tr0_04, tr0_05); \ tr1_05 = _mm256_unpacklo_epi32(tr0_06, tr0_07); \ tr1_06 = _mm256_unpackhi_epi32(tr0_04, tr0_05); \ tr1_07 = _mm256_unpackhi_epi32(tr0_06, tr0_07); \ tr1_08 = _mm256_unpacklo_epi32(tr0_08, tr0_09); \ tr1_09 = _mm256_unpacklo_epi32(tr0_10, tr0_11); \ tr1_10 = _mm256_unpackhi_epi32(tr0_08, tr0_09); \ tr1_11 = _mm256_unpackhi_epi32(tr0_10, tr0_11); \ tr1_12 = _mm256_unpacklo_epi32(tr0_12, tr0_13); \ tr1_13 = _mm256_unpacklo_epi32(tr0_14, tr0_15); \ tr1_14 = _mm256_unpackhi_epi32(tr0_12, tr0_13); \ tr1_15 = _mm256_unpackhi_epi32(tr0_14, tr0_15); \ tr0_00 = _mm256_unpacklo_epi64(tr1_00, tr1_04); \ tr0_01 = _mm256_unpackhi_epi64(tr1_00, tr1_04); \ tr0_02 = _mm256_unpacklo_epi64(tr1_02, tr1_06); \ tr0_03 = _mm256_unpackhi_epi64(tr1_02, tr1_06); \ tr0_04 = _mm256_unpacklo_epi64(tr1_01, tr1_05); \ tr0_05 = _mm256_unpackhi_epi64(tr1_01, tr1_05); \ tr0_06 = _mm256_unpacklo_epi64(tr1_03, tr1_07); \ tr0_07 = _mm256_unpackhi_epi64(tr1_03, tr1_07); \ tr0_08 = _mm256_unpacklo_epi64(tr1_08, tr1_12); \ tr0_09 = _mm256_unpackhi_epi64(tr1_08, tr1_12); \ tr0_10 = _mm256_unpacklo_epi64(tr1_10, tr1_14); \ tr0_11 = _mm256_unpackhi_epi64(tr1_10, tr1_14); \ tr0_12 = _mm256_unpacklo_epi64(tr1_09, tr1_13); \ tr0_13 = _mm256_unpackhi_epi64(tr1_09, tr1_13); \ tr0_14 
= _mm256_unpacklo_epi64(tr1_11, tr1_15); \ tr0_15 = _mm256_unpackhi_epi64(tr1_11, tr1_15); \ H00 = _mm256_permute2x128_si256(tr0_00, tr0_08, 0x20);\ H01 = _mm256_permute2x128_si256(tr0_01, tr0_09, 0x20);\ H02 = _mm256_permute2x128_si256(tr0_02, tr0_10, 0x20);\ H03 = _mm256_permute2x128_si256(tr0_03, tr0_11, 0x20);\ H04 = _mm256_permute2x128_si256(tr0_04, tr0_12, 0x20);\ H05 = _mm256_permute2x128_si256(tr0_05, tr0_13, 0x20);\ H06 = _mm256_permute2x128_si256(tr0_06, tr0_14, 0x20);\ H07 = _mm256_permute2x128_si256(tr0_07, tr0_15, 0x20);\ H08 = _mm256_permute2x128_si256(tr0_00, tr0_08, 0x31);\ H09 = _mm256_permute2x128_si256(tr0_01, tr0_09, 0x31);\ H10 = _mm256_permute2x128_si256(tr0_02, tr0_10, 0x31);\ H11 = _mm256_permute2x128_si256(tr0_03, tr0_11, 0x31);\ H12 = _mm256_permute2x128_si256(tr0_04, tr0_12, 0x31);\ H13 = _mm256_permute2x128_si256(tr0_05, tr0_13, 0x31);\ H14 = _mm256_permute2x128_si256(tr0_06, tr0_14, 0x31);\ H15 = _mm256_permute2x128_si256(tr0_07, tr0_15, 0x31);\ /* --------------------------------------------------------------------------- * transpose 8x16(ת) */ #define TRANSPOSE_8x16_16BIT(A_00, A_01, A_02, A_03, A_04, A_05, A_06, A_07, H_00, H_01,H_02, H_03, H_04, H_05, H_06, H_07, H_08, H_09, H_10, H_11, H_12, H_13, H_14, H_15) \ tr0_00 = _mm256_unpacklo_epi16(A_00, A_01); \ tr0_01 = _mm256_unpacklo_epi16(A_02, A_03); \ tr0_02 = _mm256_unpackhi_epi16(A_00, A_01); \ tr0_03 = _mm256_unpackhi_epi16(A_02, A_03); \ tr0_04 = _mm256_unpacklo_epi16(A_04, A_05); \ tr0_05 = _mm256_unpacklo_epi16(A_06, A_07); \ tr0_06 = _mm256_unpackhi_epi16(A_04, A_05); \ tr0_07 = _mm256_unpackhi_epi16(A_06, A_07); \ tr1_00 = _mm256_unpacklo_epi32(tr0_00, tr0_01); \ tr1_01 = _mm256_unpacklo_epi32(tr0_02, tr0_03); \ tr1_02 = _mm256_unpackhi_epi32(tr0_00, tr0_01); \ tr1_03 = _mm256_unpackhi_epi32(tr0_02, tr0_03); \ tr1_04 = _mm256_unpacklo_epi32(tr0_04, tr0_05); \ tr1_05 = _mm256_unpacklo_epi32(tr0_06, tr0_07); \ tr1_06 = _mm256_unpackhi_epi32(tr0_04, tr0_05); \ tr1_07 = _mm256_unpackhi_epi32(tr0_06, tr0_07); \ tr0_00 = _mm256_unpacklo_epi64(tr1_00, tr1_04); \ tr0_01 = _mm256_unpackhi_epi64(tr1_00, tr1_04); \ tr0_02 = _mm256_unpacklo_epi64(tr1_02, tr1_06); \ tr0_03 = _mm256_unpackhi_epi64(tr1_02, tr1_06); \ tr0_04 = _mm256_unpacklo_epi64(tr1_01, tr1_05); \ tr0_05 = _mm256_unpackhi_epi64(tr1_01, tr1_05); \ tr0_06 = _mm256_unpacklo_epi64(tr1_03, tr1_07); \ tr0_07 = _mm256_unpackhi_epi64(tr1_03, tr1_07); \ H_00 = _mm256_extracti128_si256(tr0_00, 0);\ H_01 = _mm256_extracti128_si256(tr0_01, 0);\ H_02 = _mm256_extracti128_si256(tr0_02, 0);\ H_03 = _mm256_extracti128_si256(tr0_03, 0);\ H_04 = _mm256_extracti128_si256(tr0_04, 0);\ H_05 = _mm256_extracti128_si256(tr0_05, 0);\ H_06 = _mm256_extracti128_si256(tr0_06, 0);\ H_07 = _mm256_extracti128_si256(tr0_07, 0);\ H_08 = _mm256_extracti128_si256(tr0_00, 1);\ H_09 = _mm256_extracti128_si256(tr0_01, 1);\ H_10 = _mm256_extracti128_si256(tr0_02, 1);\ H_11 = _mm256_extracti128_si256(tr0_03, 1);\ H_12 = _mm256_extracti128_si256(tr0_04, 1);\ H_13 = _mm256_extracti128_si256(tr0_05, 1);\ H_14 = _mm256_extracti128_si256(tr0_06, 1);\ H_15 = _mm256_extracti128_si256(tr0_07, 1);\ /* --------------------------------------------------------------------------- */ static void wavelet_16x64_avx2(coeff_t *coeff) { // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, 
V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; // 64*16 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; //ʱ __m128i B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15, B16, B17, B18, B19, B20, B21, B22, B23, B24, B25, B26, B27, B28, B29, B30, B31; __m128i B32, B33, B34, B35, B36, B37, B38, B39, B40, B41, B42, B43, B44, B45, B46, B47, B48, B49, B50, B51, B52, B53, B54, B55, B56, B57, B58, B59, B60, B61, B62, B63; __m256i tr0_00, tr0_01, tr0_02, tr0_03, tr0_04, tr0_05, tr0_06, tr0_07, tr0_08, tr0_09, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; __m256i tr1_00, tr1_01, tr1_02, tr1_03, tr1_04, tr1_05, tr1_06, tr1_07, tr1_08, tr1_09, tr1_10, tr1_11, tr1_12, tr1_13, tr1_14, tr1_15; int i; __m128i mAddOffset1 = _mm_set1_epi16(1); __m256i mAddOffset2 = _mm256_set1_epi16(2); V00 = _mm256_load_si256((__m256i*)&coeff[16 * 0]); V01 = _mm256_load_si256((__m256i*)&coeff[16 * 1]); V02 = _mm256_load_si256((__m256i*)&coeff[16 * 2]); V03 = _mm256_load_si256((__m256i*)&coeff[16 * 3]); V04 = _mm256_load_si256((__m256i*)&coeff[16 * 4]); V05 = _mm256_load_si256((__m256i*)&coeff[16 * 5]); V06 = _mm256_load_si256((__m256i*)&coeff[16 * 6]); V07 = _mm256_load_si256((__m256i*)&coeff[16 * 7]); V08 = _mm256_load_si256((__m256i*)&coeff[16 * 8]); V09 = _mm256_load_si256((__m256i*)&coeff[16 * 9]); V10 = _mm256_load_si256((__m256i*)&coeff[16 * 10]); V11 = _mm256_load_si256((__m256i*)&coeff[16 * 11]); V12 = _mm256_load_si256((__m256i*)&coeff[16 * 12]); V13 = _mm256_load_si256((__m256i*)&coeff[16 * 13]); V14 = _mm256_load_si256((__m256i*)&coeff[16 * 14]); V15 = _mm256_load_si256((__m256i*)&coeff[16 * 15]); V16 = _mm256_load_si256((__m256i*)&coeff[16 * 16]); V17 = _mm256_load_si256((__m256i*)&coeff[16 * 17]); V18 = _mm256_load_si256((__m256i*)&coeff[16 * 18]); V19 = _mm256_load_si256((__m256i*)&coeff[16 * 19]); V20 = _mm256_load_si256((__m256i*)&coeff[16 * 20]); V21 = _mm256_load_si256((__m256i*)&coeff[16 * 21]); V22 = _mm256_load_si256((__m256i*)&coeff[16 * 22]); V23 = _mm256_load_si256((__m256i*)&coeff[16 * 23]); V24 = _mm256_load_si256((__m256i*)&coeff[16 * 24]); V25 = _mm256_load_si256((__m256i*)&coeff[16 * 25]); V26 = _mm256_load_si256((__m256i*)&coeff[16 * 26]); V27 = _mm256_load_si256((__m256i*)&coeff[16 * 27]); V28 = _mm256_load_si256((__m256i*)&coeff[16 * 28]); V29 = _mm256_load_si256((__m256i*)&coeff[16 * 29]); V30 = _mm256_load_si256((__m256i*)&coeff[16 * 30]); V31 = _mm256_load_si256((__m256i*)&coeff[16 * 31]); V32 = _mm256_load_si256((__m256i*)&coeff[16 * 32]); V33 = _mm256_load_si256((__m256i*)&coeff[16 * 33]); V34 = _mm256_load_si256((__m256i*)&coeff[16 * 34]); V35 = _mm256_load_si256((__m256i*)&coeff[16 * 35]); V36 = _mm256_load_si256((__m256i*)&coeff[16 * 36]); V37 = _mm256_load_si256((__m256i*)&coeff[16 * 37]); V38 = _mm256_load_si256((__m256i*)&coeff[16 * 38]); V39 = _mm256_load_si256((__m256i*)&coeff[16 * 39]); V40 = _mm256_load_si256((__m256i*)&coeff[16 * 40]); V41 = _mm256_load_si256((__m256i*)&coeff[16 * 41]); V42 = _mm256_load_si256((__m256i*)&coeff[16 * 42]); V43 = _mm256_load_si256((__m256i*)&coeff[16 * 43]); V44 = _mm256_load_si256((__m256i*)&coeff[16 * 44]); V45 = _mm256_load_si256((__m256i*)&coeff[16 * 45]); V46 = _mm256_load_si256((__m256i*)&coeff[16 * 46]); V47 = _mm256_load_si256((__m256i*)&coeff[16 * 47]); V48 = _mm256_load_si256((__m256i*)&coeff[16 * 48]); V49 = _mm256_load_si256((__m256i*)&coeff[16 * 49]); V50 = _mm256_load_si256((__m256i*)&coeff[16 * 
50]); V51 = _mm256_load_si256((__m256i*)&coeff[16 * 51]); V52 = _mm256_load_si256((__m256i*)&coeff[16 * 52]); V53 = _mm256_load_si256((__m256i*)&coeff[16 * 53]); V54 = _mm256_load_si256((__m256i*)&coeff[16 * 54]); V55 = _mm256_load_si256((__m256i*)&coeff[16 * 55]); V56 = _mm256_load_si256((__m256i*)&coeff[16 * 56]); V57 = _mm256_load_si256((__m256i*)&coeff[16 * 57]); V58 = _mm256_load_si256((__m256i*)&coeff[16 * 58]); V59 = _mm256_load_si256((__m256i*)&coeff[16 * 59]); V60 = _mm256_load_si256((__m256i*)&coeff[16 * 60]); V61 = _mm256_load_si256((__m256i*)&coeff[16 * 61]); V62 = _mm256_load_si256((__m256i*)&coeff[16 * 62]); V63 = _mm256_load_si256((__m256i*)&coeff[16 * 63]); TRANSPOSE_16x16_16BIT(V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_16x16_16BIT(V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT(V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_16x16_16BIT(V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63, T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); /* step 1: horizontal transform */ // pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; for (i = 0; i < 4; i++){ T01[i] = _mm256_sub_epi16(T01[i], _mm256_srai_epi16(_mm256_add_epi16(T00[i], T02[i]), 1)); T03[i] = _mm256_sub_epi16(T03[i], _mm256_srai_epi16(_mm256_add_epi16(T02[i], T04[i]), 1)); T05[i] = _mm256_sub_epi16(T05[i], _mm256_srai_epi16(_mm256_add_epi16(T04[i], T06[i]), 1)); T07[i] = _mm256_sub_epi16(T07[i], _mm256_srai_epi16(_mm256_add_epi16(T06[i], T08[i]), 1)); T09[i] = _mm256_sub_epi16(T09[i], _mm256_srai_epi16(_mm256_add_epi16(T08[i], T10[i]), 1)); T11[i] = _mm256_sub_epi16(T11[i], _mm256_srai_epi16(_mm256_add_epi16(T10[i], T12[i]), 1)); T13[i] = _mm256_sub_epi16(T13[i], _mm256_srai_epi16(_mm256_add_epi16(T12[i], T14[i]), 1)); T15[i] = _mm256_sub_epi16(T15[i], _mm256_srai_epi16(_mm256_add_epi16(T14[i], T14[i]), 1)); } for (i = 0; i < 4; i++){ T00[i] = _mm256_add_epi16(T00[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T01[i], T01[i]), mAddOffset2), 2)); T02[i] = _mm256_add_epi16(T02[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T01[i], T03[i]), mAddOffset2), 2)); T04[i] = _mm256_add_epi16(T04[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T03[i], T05[i]), mAddOffset2), 2)); T06[i] = _mm256_add_epi16(T06[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T05[i], T07[i]), mAddOffset2), 2)); T08[i] = _mm256_add_epi16(T08[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T07[i], T09[i]), mAddOffset2), 2)); T10[i] = _mm256_add_epi16(T10[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T09[i], T11[i]), mAddOffset2), 2)); T12[i] = _mm256_add_epi16(T12[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T11[i], T13[i]), mAddOffset2), 2)); T14[i] = _mm256_add_epi16(T14[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T13[i], T15[i]), mAddOffset2), 2)); } /* step 2: vertical transform */ /* copy ת*/ TRANSPOSE_8x16_16BIT(T00[0], T02[0], T04[0], T06[0], T08[0], 
T10[0], T12[0], T14[0], B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15); TRANSPOSE_8x16_16BIT(T00[1], T02[1], T04[1], T06[1], T08[1], T10[1], T12[1], T14[1], B16, B17, B18, B19, B20, B21, B22, B23, B24, B25, B26, B27, B28, B29, B30, B31); TRANSPOSE_8x16_16BIT(T00[2], T02[2], T04[2], T06[2], T08[2], T10[2], T12[2], T14[2], B32, B33, B34, B35, B36, B37, B38, B39, B40, B41, B42, B43, B44, B45, B46, B47); TRANSPOSE_8x16_16BIT(T00[3], T02[3], T04[3], T06[3], T08[3], T10[3], T12[3], T14[3], B48, B49, B50, B51, B52, B53, B54, B55, B56, B57, B58, B59, B60, B61, B62, B63); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; B01 = _mm_sub_epi16(B01, _mm_srai_epi16(_mm_add_epi16(B00, B02), 1)); B03 = _mm_sub_epi16(B03, _mm_srai_epi16(_mm_add_epi16(B02, B04), 1)); B05 = _mm_sub_epi16(B05, _mm_srai_epi16(_mm_add_epi16(B04, B06), 1)); B07 = _mm_sub_epi16(B07, _mm_srai_epi16(_mm_add_epi16(B06, B08), 1)); B09 = _mm_sub_epi16(B09, _mm_srai_epi16(_mm_add_epi16(B08, B10), 1)); B11 = _mm_sub_epi16(B11, _mm_srai_epi16(_mm_add_epi16(B10, B12), 1)); B13 = _mm_sub_epi16(B13, _mm_srai_epi16(_mm_add_epi16(B12, B14), 1)); B15 = _mm_sub_epi16(B15, _mm_srai_epi16(_mm_add_epi16(B14, B16), 1)); B17 = _mm_sub_epi16(B17, _mm_srai_epi16(_mm_add_epi16(B16, B18), 1)); B19 = _mm_sub_epi16(B19, _mm_srai_epi16(_mm_add_epi16(B18, B20), 1)); B21 = _mm_sub_epi16(B21, _mm_srai_epi16(_mm_add_epi16(B20, B22), 1)); B23 = _mm_sub_epi16(B23, _mm_srai_epi16(_mm_add_epi16(B22, B24), 1)); B25 = _mm_sub_epi16(B25, _mm_srai_epi16(_mm_add_epi16(B24, B26), 1)); B27 = _mm_sub_epi16(B27, _mm_srai_epi16(_mm_add_epi16(B26, B28), 1)); B29 = _mm_sub_epi16(B29, _mm_srai_epi16(_mm_add_epi16(B28, B30), 1)); B31 = _mm_sub_epi16(B31, _mm_srai_epi16(_mm_add_epi16(B30, B32), 1)); B33 = _mm_sub_epi16(B33, _mm_srai_epi16(_mm_add_epi16(B32, B34), 1)); B35 = _mm_sub_epi16(B35, _mm_srai_epi16(_mm_add_epi16(B34, B36), 1)); B37 = _mm_sub_epi16(B37, _mm_srai_epi16(_mm_add_epi16(B36, B38), 1)); B39 = _mm_sub_epi16(B39, _mm_srai_epi16(_mm_add_epi16(B38, B40), 1)); B41 = _mm_sub_epi16(B41, _mm_srai_epi16(_mm_add_epi16(B40, B42), 1)); B43 = _mm_sub_epi16(B43, _mm_srai_epi16(_mm_add_epi16(B42, B44), 1)); B45 = _mm_sub_epi16(B45, _mm_srai_epi16(_mm_add_epi16(B44, B46), 1)); B47 = _mm_sub_epi16(B47, _mm_srai_epi16(_mm_add_epi16(B46, B48), 1)); B49 = _mm_sub_epi16(B49, _mm_srai_epi16(_mm_add_epi16(B48, B50), 1)); B51 = _mm_sub_epi16(B51, _mm_srai_epi16(_mm_add_epi16(B50, B52), 1)); B53 = _mm_sub_epi16(B53, _mm_srai_epi16(_mm_add_epi16(B52, B54), 1)); B55 = _mm_sub_epi16(B55, _mm_srai_epi16(_mm_add_epi16(B54, B56), 1)); B57 = _mm_sub_epi16(B57, _mm_srai_epi16(_mm_add_epi16(B56, B58), 1)); B59 = _mm_sub_epi16(B59, _mm_srai_epi16(_mm_add_epi16(B58, B60), 1)); B61 = _mm_sub_epi16(B61, _mm_srai_epi16(_mm_add_epi16(B60, B62), 1)); B63 = _mm_sub_epi16(B63, _mm_srai_epi16(_mm_add_epi16(B62, B62), 1)); //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); B00 = _mm_add_epi16(_mm_slli_epi16(B00, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B01, B01), mAddOffset1), 1)); B02 = _mm_add_epi16(_mm_slli_epi16(B02, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B01, B03), mAddOffset1), 1)); B04 = _mm_add_epi16(_mm_slli_epi16(B04, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B03, B05), mAddOffset1), 1)); B06 = _mm_add_epi16(_mm_slli_epi16(B06, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B05, B07), mAddOffset1), 1)); B08 = _mm_add_epi16(_mm_slli_epi16(B08, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B07, B09), mAddOffset1), 1)); B10 = 
_mm_add_epi16(_mm_slli_epi16(B10, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B09, B11), mAddOffset1), 1)); B12 = _mm_add_epi16(_mm_slli_epi16(B12, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B11, B13), mAddOffset1), 1)); B14 = _mm_add_epi16(_mm_slli_epi16(B14, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B13, B15), mAddOffset1), 1)); B16 = _mm_add_epi16(_mm_slli_epi16(B16, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B15, B17), mAddOffset1), 1)); B18 = _mm_add_epi16(_mm_slli_epi16(B18, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B17, B19), mAddOffset1), 1)); B20 = _mm_add_epi16(_mm_slli_epi16(B20, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B19, B21), mAddOffset1), 1)); B22 = _mm_add_epi16(_mm_slli_epi16(B22, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B21, B23), mAddOffset1), 1)); B24 = _mm_add_epi16(_mm_slli_epi16(B24, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B23, B25), mAddOffset1), 1)); B26 = _mm_add_epi16(_mm_slli_epi16(B26, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B25, B27), mAddOffset1), 1)); B28 = _mm_add_epi16(_mm_slli_epi16(B28, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B27, B29), mAddOffset1), 1)); B30 = _mm_add_epi16(_mm_slli_epi16(B30, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B29, B31), mAddOffset1), 1)); B32 = _mm_add_epi16(_mm_slli_epi16(B32, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B31, B33), mAddOffset1), 1)); B34 = _mm_add_epi16(_mm_slli_epi16(B34, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B33, B35), mAddOffset1), 1)); B36 = _mm_add_epi16(_mm_slli_epi16(B36, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B35, B37), mAddOffset1), 1)); B38 = _mm_add_epi16(_mm_slli_epi16(B38, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B37, B39), mAddOffset1), 1)); B40 = _mm_add_epi16(_mm_slli_epi16(B40, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B39, B41), mAddOffset1), 1)); B42 = _mm_add_epi16(_mm_slli_epi16(B42, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B41, B43), mAddOffset1), 1)); B44 = _mm_add_epi16(_mm_slli_epi16(B44, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B43, B45), mAddOffset1), 1)); B46 = _mm_add_epi16(_mm_slli_epi16(B46, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B45, B47), mAddOffset1), 1)); B48 = _mm_add_epi16(_mm_slli_epi16(B48, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B47, B49), mAddOffset1), 1)); B50 = _mm_add_epi16(_mm_slli_epi16(B50, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B49, B51), mAddOffset1), 1)); B52 = _mm_add_epi16(_mm_slli_epi16(B52, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B51, B53), mAddOffset1), 1)); B54 = _mm_add_epi16(_mm_slli_epi16(B54, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B53, B55), mAddOffset1), 1)); B56 = _mm_add_epi16(_mm_slli_epi16(B56, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B55, B57), mAddOffset1), 1)); B58 = _mm_add_epi16(_mm_slli_epi16(B58, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B57, B59), mAddOffset1), 1)); B60 = _mm_add_epi16(_mm_slli_epi16(B60, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B59, B61), mAddOffset1), 1)); B62 = _mm_add_epi16(_mm_slli_epi16(B62, 1), _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(B61, B63), mAddOffset1), 1)); //STORE _mm_store_si128((__m128i*)&coeff[8 * 0], B00); _mm_store_si128((__m128i*)&coeff[8 * 1], B02); _mm_store_si128((__m128i*)&coeff[8 * 2], B04); _mm_store_si128((__m128i*)&coeff[8 * 3], B06); _mm_store_si128((__m128i*)&coeff[8 * 4], B08); _mm_store_si128((__m128i*)&coeff[8 * 5], B10); _mm_store_si128((__m128i*)&coeff[8 * 6], B12); _mm_store_si128((__m128i*)&coeff[8 * 7], B14); 
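/* remaining stores: rows 8..31 of the down-sampled 8x32 block, written back to coeff with a row stride of 8 coefficients */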
_mm_store_si128((__m128i*)&coeff[8 * 8], B16); _mm_store_si128((__m128i*)&coeff[8 * 9], B18); _mm_store_si128((__m128i*)&coeff[8 * 10], B20); _mm_store_si128((__m128i*)&coeff[8 * 11], B22); _mm_store_si128((__m128i*)&coeff[8 * 12], B24); _mm_store_si128((__m128i*)&coeff[8 * 13], B26); _mm_store_si128((__m128i*)&coeff[8 * 14], B28); _mm_store_si128((__m128i*)&coeff[8 * 15], B30); _mm_store_si128((__m128i*)&coeff[8 * 16], B32); _mm_store_si128((__m128i*)&coeff[8 * 17], B34); _mm_store_si128((__m128i*)&coeff[8 * 18], B36); _mm_store_si128((__m128i*)&coeff[8 * 19], B38); _mm_store_si128((__m128i*)&coeff[8 * 20], B40); _mm_store_si128((__m128i*)&coeff[8 * 21], B42); _mm_store_si128((__m128i*)&coeff[8 * 22], B44); _mm_store_si128((__m128i*)&coeff[8 * 23], B46); _mm_store_si128((__m128i*)&coeff[8 * 24], B48); _mm_store_si128((__m128i*)&coeff[8 * 25], B50); _mm_store_si128((__m128i*)&coeff[8 * 26], B52); _mm_store_si128((__m128i*)&coeff[8 * 27], B54); _mm_store_si128((__m128i*)&coeff[8 * 28], B56); _mm_store_si128((__m128i*)&coeff[8 * 29], B58); _mm_store_si128((__m128i*)&coeff[8 * 30], B60); _mm_store_si128((__m128i*)&coeff[8 * 31], B62); } /* --------------------------------------------------------------------------- */ static void wavelet_64x16_avx2(coeff_t *coeff) { // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; // 64*16 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; //ʱ 64*16 __m256i A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2], A08[2], A09[2], A10[2], A11[2], A12[2], A13[2], A14[2], A15[2]; __m256i tr0_00, tr0_01, tr0_02, tr0_03, tr0_04, tr0_05, tr0_06, tr0_07, tr0_08, tr0_09, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; __m256i tr1_00, tr1_01, tr1_02, tr1_03, tr1_04, tr1_05, tr1_06, tr1_07, tr1_08, tr1_09, tr1_10, tr1_11, tr1_12, tr1_13, tr1_14, tr1_15; int i; __m256i mAddOffset1 = _mm256_set1_epi16(1); __m256i mAddOffset2 = _mm256_set1_epi16(2); //load for (i = 0; i < 4; i++) { T00[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 0])); T01[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 1])); T02[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 2])); T03[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 3])); T04[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 4])); T05[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 5])); T06[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 6])); T07[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 7])); T08[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 8])); T09[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 9])); T10[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 10])); T11[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 11])); T12[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 12])); T13[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 13])); T14[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 14])); T15[i] = 
_mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 15])); } TRANSPOSE_16x16_16BIT(T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15); TRANSPOSE_16x16_16BIT(T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1], V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31); TRANSPOSE_16x16_16BIT(T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47); TRANSPOSE_16x16_16BIT(T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3], V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63); //pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; V01 = _mm256_sub_epi16(V01, _mm256_srai_epi16(_mm256_add_epi16(V00, V02), 1)); V03 = _mm256_sub_epi16(V03, _mm256_srai_epi16(_mm256_add_epi16(V02, V04), 1)); V05 = _mm256_sub_epi16(V05, _mm256_srai_epi16(_mm256_add_epi16(V04, V06), 1)); V07 = _mm256_sub_epi16(V07, _mm256_srai_epi16(_mm256_add_epi16(V06, V08), 1)); V09 = _mm256_sub_epi16(V09, _mm256_srai_epi16(_mm256_add_epi16(V08, V10), 1)); V11 = _mm256_sub_epi16(V11, _mm256_srai_epi16(_mm256_add_epi16(V10, V12), 1)); V13 = _mm256_sub_epi16(V13, _mm256_srai_epi16(_mm256_add_epi16(V12, V14), 1)); V15 = _mm256_sub_epi16(V15, _mm256_srai_epi16(_mm256_add_epi16(V14, V16), 1)); V17 = _mm256_sub_epi16(V17, _mm256_srai_epi16(_mm256_add_epi16(V16, V18), 1)); V19 = _mm256_sub_epi16(V19, _mm256_srai_epi16(_mm256_add_epi16(V18, V20), 1)); V21 = _mm256_sub_epi16(V21, _mm256_srai_epi16(_mm256_add_epi16(V20, V22), 1)); V23 = _mm256_sub_epi16(V23, _mm256_srai_epi16(_mm256_add_epi16(V22, V24), 1)); V25 = _mm256_sub_epi16(V25, _mm256_srai_epi16(_mm256_add_epi16(V24, V26), 1)); V27 = _mm256_sub_epi16(V27, _mm256_srai_epi16(_mm256_add_epi16(V26, V28), 1)); V29 = _mm256_sub_epi16(V29, _mm256_srai_epi16(_mm256_add_epi16(V28, V30), 1)); V31 = _mm256_sub_epi16(V31, _mm256_srai_epi16(_mm256_add_epi16(V30, V32), 1)); V33 = _mm256_sub_epi16(V33, _mm256_srai_epi16(_mm256_add_epi16(V32, V34), 1)); V35 = _mm256_sub_epi16(V35, _mm256_srai_epi16(_mm256_add_epi16(V34, V36), 1)); V37 = _mm256_sub_epi16(V37, _mm256_srai_epi16(_mm256_add_epi16(V36, V38), 1)); V39 = _mm256_sub_epi16(V39, _mm256_srai_epi16(_mm256_add_epi16(V38, V40), 1)); V41 = _mm256_sub_epi16(V41, _mm256_srai_epi16(_mm256_add_epi16(V40, V42), 1)); V43 = _mm256_sub_epi16(V43, _mm256_srai_epi16(_mm256_add_epi16(V42, V44), 1)); V45 = _mm256_sub_epi16(V45, _mm256_srai_epi16(_mm256_add_epi16(V44, V46), 1)); V47 = _mm256_sub_epi16(V47, _mm256_srai_epi16(_mm256_add_epi16(V46, V48), 1)); V49 = _mm256_sub_epi16(V49, _mm256_srai_epi16(_mm256_add_epi16(V48, V50), 1)); V51 = _mm256_sub_epi16(V51, _mm256_srai_epi16(_mm256_add_epi16(V50, V52), 1)); V53 = _mm256_sub_epi16(V53, _mm256_srai_epi16(_mm256_add_epi16(V52, V54), 1)); V55 = _mm256_sub_epi16(V55, _mm256_srai_epi16(_mm256_add_epi16(V54, V56), 1)); V57 = _mm256_sub_epi16(V57, _mm256_srai_epi16(_mm256_add_epi16(V56, V58), 1)); V59 = _mm256_sub_epi16(V59, _mm256_srai_epi16(_mm256_add_epi16(V58, V60), 1)); V61 = _mm256_sub_epi16(V61, _mm256_srai_epi16(_mm256_add_epi16(V60, V62), 1)); V63 = _mm256_sub_epi16(V63, 
_mm256_srai_epi16(_mm256_add_epi16(V62, V62), 1)); //pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; V00 = _mm256_add_epi16(V00, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V01, V01), mAddOffset2), 2)); V02 = _mm256_add_epi16(V02, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V01, V03), mAddOffset2), 2)); V04 = _mm256_add_epi16(V04, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V03, V05), mAddOffset2), 2)); V06 = _mm256_add_epi16(V06, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V05, V07), mAddOffset2), 2)); V08 = _mm256_add_epi16(V08, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V07, V09), mAddOffset2), 2)); V10 = _mm256_add_epi16(V10, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V09, V11), mAddOffset2), 2)); V12 = _mm256_add_epi16(V12, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V11, V13), mAddOffset2), 2)); V14 = _mm256_add_epi16(V14, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V13, V15), mAddOffset2), 2)); V16 = _mm256_add_epi16(V16, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V15, V17), mAddOffset2), 2)); V18 = _mm256_add_epi16(V18, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V17, V19), mAddOffset2), 2)); V20 = _mm256_add_epi16(V20, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V19, V21), mAddOffset2), 2)); V22 = _mm256_add_epi16(V22, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V21, V23), mAddOffset2), 2)); V24 = _mm256_add_epi16(V24, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V23, V25), mAddOffset2), 2)); V26 = _mm256_add_epi16(V26, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V25, V27), mAddOffset2), 2)); V28 = _mm256_add_epi16(V28, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V27, V29), mAddOffset2), 2)); V30 = _mm256_add_epi16(V30, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V29, V31), mAddOffset2), 2)); V32 = _mm256_add_epi16(V32, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V31, V33), mAddOffset2), 2)); V34 = _mm256_add_epi16(V34, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V33, V35), mAddOffset2), 2)); V36 = _mm256_add_epi16(V36, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V35, V37), mAddOffset2), 2)); V38 = _mm256_add_epi16(V38, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V37, V39), mAddOffset2), 2)); V40 = _mm256_add_epi16(V40, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V39, V41), mAddOffset2), 2)); V42 = _mm256_add_epi16(V42, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V41, V43), mAddOffset2), 2)); V44 = _mm256_add_epi16(V44, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V43, V45), mAddOffset2), 2)); V46 = _mm256_add_epi16(V46, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V45, V47), mAddOffset2), 2)); V48 = _mm256_add_epi16(V48, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V47, V49), mAddOffset2), 2)); V50 = _mm256_add_epi16(V50, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V49, V51), mAddOffset2), 2)); V52 = _mm256_add_epi16(V52, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V51, V53), mAddOffset2), 2)); V54 = _mm256_add_epi16(V54, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V53, V55), mAddOffset2), 2)); V56 = _mm256_add_epi16(V56, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V55, V57), mAddOffset2), 2)); V58 = _mm256_add_epi16(V58, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V57, V59), mAddOffset2), 2)); V60 = _mm256_add_epi16(V60, _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V59, V61), mAddOffset2), 2)); V62 = _mm256_add_epi16(V62, 
_mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V61, V63), mAddOffset2), 2)); TRANSPOSE_16x16_16BIT(V00, V02, V04, V06, V08, V10, V12, V14, V16, V18, V20, V22, V24, V26, V28, V30, A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0], A08[0], A09[0], A10[0], A11[0], A12[0], A13[0], A14[0], A15[0]); TRANSPOSE_16x16_16BIT(V32, V34, V36, V38, V40, V42, V44, V46, V48, V50, V52, V54, V56, V58, V60, V62, A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1], A08[1], A09[1], A10[1], A11[1], A12[1], A13[1], A14[1], A15[1]); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; for (i = 0; i < 2; i++){ A01[i] = _mm256_sub_epi16(A01[i], _mm256_srai_epi16(_mm256_add_epi16(A00[i], A02[i]), 1)); A03[i] = _mm256_sub_epi16(A03[i], _mm256_srai_epi16(_mm256_add_epi16(A02[i], A04[i]), 1)); A05[i] = _mm256_sub_epi16(A05[i], _mm256_srai_epi16(_mm256_add_epi16(A04[i], A06[i]), 1)); A07[i] = _mm256_sub_epi16(A07[i], _mm256_srai_epi16(_mm256_add_epi16(A06[i], A08[i]), 1)); A09[i] = _mm256_sub_epi16(A09[i], _mm256_srai_epi16(_mm256_add_epi16(A08[i], A10[i]), 1)); A11[i] = _mm256_sub_epi16(A11[i], _mm256_srai_epi16(_mm256_add_epi16(A10[i], A12[i]), 1)); A13[i] = _mm256_sub_epi16(A13[i], _mm256_srai_epi16(_mm256_add_epi16(A12[i], A14[i]), 1)); A15[i] = _mm256_sub_epi16(A15[i], _mm256_srai_epi16(_mm256_add_epi16(A14[i], A14[i]), 1)); } //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); for (i = 0; i < 2; i++){ A00[i] = _mm256_add_epi16(_mm256_slli_epi16(A00[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A01[i], A01[i]), mAddOffset1), 1)); A02[i] = _mm256_add_epi16(_mm256_slli_epi16(A02[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A01[i], A03[i]), mAddOffset1), 1)); A04[i] = _mm256_add_epi16(_mm256_slli_epi16(A04[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A03[i], A05[i]), mAddOffset1), 1)); A06[i] = _mm256_add_epi16(_mm256_slli_epi16(A06[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A05[i], A07[i]), mAddOffset1), 1)); A08[i] = _mm256_add_epi16(_mm256_slli_epi16(A08[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A07[i], A09[i]), mAddOffset1), 1)); A10[i] = _mm256_add_epi16(_mm256_slli_epi16(A10[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A09[i], A11[i]), mAddOffset1), 1)); A12[i] = _mm256_add_epi16(_mm256_slli_epi16(A12[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A11[i], A13[i]), mAddOffset1), 1)); A14[i] = _mm256_add_epi16(_mm256_slli_epi16(A14[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A13[i], A15[i]), mAddOffset1), 1)); } //Store for (i = 0; i < 2; i++){ _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 0], A00[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 1], A02[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 2], A04[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 3], A06[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 4], A08[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 5], A10[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 6], A12[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 7], A14[i]); } } /* --------------------------------------------------------------------------- */ static void wavelet_64x64_avx2(coeff_t *coeff) { // 16*64 __m256i V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V32[4], V33[4], 
V34[4], V35[4], V36[4], V37[4], V38[4], V39[4], V40[4], V41[4], V42[4], V43[4], V44[4], V45[4], V46[4], V47[4], V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4]; // 64*64 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4]; //ʱ 32*64 __m256i A00[2], A01[2], A02[2], A03[2], A04[2], A05[2], A06[2], A07[2], A08[2], A09[2], A10[2], A11[2], A12[2], A13[2], A14[2], A15[2], A16[2], A17[2], A18[2], A19[2], A20[2], A21[2], A22[2], A23[2], A24[2], A25[2], A26[2], A27[2], A28[2], A29[2], A30[2], A31[2], A32[2], A33[2], A34[2], A35[2], A36[2], A37[2], A38[2], A39[2], A40[2], A41[2], A42[2], A43[2], A44[2], A45[2], A46[2], A47[2], A48[2], A49[2], A50[2], A51[2], A52[2], A53[2], A54[2], A55[2], A56[2], A57[2], A58[2], A59[2], A60[2], A61[2], A62[2], A63[2]; __m256i tr0_00, tr0_01, tr0_02, tr0_03, tr0_04, tr0_05, tr0_06, tr0_07, tr0_08, tr0_09, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; __m256i tr1_00, tr1_01, tr1_02, tr1_03, tr1_04, tr1_05, tr1_06, tr1_07, tr1_08, tr1_09, tr1_10, tr1_11, tr1_12, tr1_13, tr1_14, tr1_15; int i; __m256i mAddOffset1 = _mm256_set1_epi16(1); __m256i mAddOffset2 = _mm256_set1_epi16(2); //load for (i = 0; i < 4; i++){ T00[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 0])); T01[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 1])); T02[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 2])); T03[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 3])); T04[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 4])); T05[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 5])); T06[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 6])); T07[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 7])); T08[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 8])); T09[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 9])); T10[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 10])); T11[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 11])); T12[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 12])); T13[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 13])); T14[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 14])); T15[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 15])); T16[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 16])); T17[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 17])); T18[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 18])); T19[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 19])); T20[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 20])); T21[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 21])); T22[i] = _mm256_load_si256((__m256i 
const *)((__m128i*)&coeff[16 * i + 64 * 22])); T23[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 23])); T24[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 24])); T25[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 25])); T26[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 26])); T27[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 27])); T28[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 28])); T29[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 29])); T30[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 30])); T31[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 31])); T32[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 32])); T33[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 33])); T34[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 34])); T35[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 35])); T36[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 36])); T37[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 37])); T38[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 38])); T39[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 39])); T40[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 40])); T41[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 41])); T42[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 42])); T43[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 43])); T44[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 44])); T45[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 45])); T46[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 46])); T47[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 47])); T48[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 48])); T49[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 49])); T50[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 50])); T51[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 51])); T52[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 52])); T53[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 53])); T54[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 54])); T55[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 55])); T56[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 56])); T57[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 57])); T58[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 58])); T59[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 59])); T60[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 60])); T61[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 61])); T62[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 62])); T63[i] = _mm256_load_si256((__m256i const *)((__m128i*)&coeff[16 * i + 64 * 63])); } //0-15ת TRANSPOSE_16x16_16BIT(T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], 
T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_16x16_16BIT(T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_16x16_16BIT(T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], V32[0], V33[0], V34[0], V35[0], V36[0], V37[0], V38[0], V39[0], V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0]); TRANSPOSE_16x16_16BIT(T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3], V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0]); //16-31ת TRANSPOSE_16x16_16BIT(T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_16x16_16BIT(T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_16x16_16BIT(T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2], V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1], V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1]); TRANSPOSE_16x16_16BIT(T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1]); //32-47ת TRANSPOSE_16x16_16BIT(T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2]); TRANSPOSE_16x16_16BIT(T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2]); TRANSPOSE_16x16_16BIT(T32[2], T33[2], T34[2], T35[2], T36[2], T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2]); TRANSPOSE_16x16_16BIT(T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]); //48-63ת TRANSPOSE_16x16_16BIT(T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0], V00[3], V01[3], V02[3], V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], V14[3], 
V15[3]); TRANSPOSE_16x16_16BIT(T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3]); TRANSPOSE_16x16_16BIT(T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2], V32[3], V33[3], V34[3], V35[3], V36[3], V37[3], V38[3], V39[3], V40[3], V41[3], V42[3], V43[3], V44[3], V45[3], V46[3], V47[3]); TRANSPOSE_16x16_16BIT(T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3], V48[3], V49[3], V50[3], V51[3], V52[3], V53[3], V54[3], V55[3], V56[3], V57[3], V58[3], V59[3], V60[3], V61[3], V62[3], V63[3]); //pExt[x] -= (pExt[x - 1] + pExt[x + 1]) >> 1; for (i = 0; i < 4; i++){ V01[i] = _mm256_sub_epi16(V01[i], _mm256_srai_epi16(_mm256_add_epi16(V00[i], V02[i]), 1)); V03[i] = _mm256_sub_epi16(V03[i], _mm256_srai_epi16(_mm256_add_epi16(V02[i], V04[i]), 1)); V05[i] = _mm256_sub_epi16(V05[i], _mm256_srai_epi16(_mm256_add_epi16(V04[i], V06[i]), 1)); V07[i] = _mm256_sub_epi16(V07[i], _mm256_srai_epi16(_mm256_add_epi16(V06[i], V08[i]), 1)); V09[i] = _mm256_sub_epi16(V09[i], _mm256_srai_epi16(_mm256_add_epi16(V08[i], V10[i]), 1)); V11[i] = _mm256_sub_epi16(V11[i], _mm256_srai_epi16(_mm256_add_epi16(V10[i], V12[i]), 1)); V13[i] = _mm256_sub_epi16(V13[i], _mm256_srai_epi16(_mm256_add_epi16(V12[i], V14[i]), 1)); V15[i] = _mm256_sub_epi16(V15[i], _mm256_srai_epi16(_mm256_add_epi16(V14[i], V16[i]), 1)); V17[i] = _mm256_sub_epi16(V17[i], _mm256_srai_epi16(_mm256_add_epi16(V16[i], V18[i]), 1)); V19[i] = _mm256_sub_epi16(V19[i], _mm256_srai_epi16(_mm256_add_epi16(V18[i], V20[i]), 1)); V21[i] = _mm256_sub_epi16(V21[i], _mm256_srai_epi16(_mm256_add_epi16(V20[i], V22[i]), 1)); V23[i] = _mm256_sub_epi16(V23[i], _mm256_srai_epi16(_mm256_add_epi16(V22[i], V24[i]), 1)); V25[i] = _mm256_sub_epi16(V25[i], _mm256_srai_epi16(_mm256_add_epi16(V24[i], V26[i]), 1)); V27[i] = _mm256_sub_epi16(V27[i], _mm256_srai_epi16(_mm256_add_epi16(V26[i], V28[i]), 1)); V29[i] = _mm256_sub_epi16(V29[i], _mm256_srai_epi16(_mm256_add_epi16(V28[i], V30[i]), 1)); V31[i] = _mm256_sub_epi16(V31[i], _mm256_srai_epi16(_mm256_add_epi16(V30[i], V32[i]), 1)); V33[i] = _mm256_sub_epi16(V33[i], _mm256_srai_epi16(_mm256_add_epi16(V32[i], V34[i]), 1)); V35[i] = _mm256_sub_epi16(V35[i], _mm256_srai_epi16(_mm256_add_epi16(V34[i], V36[i]), 1)); V37[i] = _mm256_sub_epi16(V37[i], _mm256_srai_epi16(_mm256_add_epi16(V36[i], V38[i]), 1)); V39[i] = _mm256_sub_epi16(V39[i], _mm256_srai_epi16(_mm256_add_epi16(V38[i], V40[i]), 1)); V41[i] = _mm256_sub_epi16(V41[i], _mm256_srai_epi16(_mm256_add_epi16(V40[i], V42[i]), 1)); V43[i] = _mm256_sub_epi16(V43[i], _mm256_srai_epi16(_mm256_add_epi16(V42[i], V44[i]), 1)); V45[i] = _mm256_sub_epi16(V45[i], _mm256_srai_epi16(_mm256_add_epi16(V44[i], V46[i]), 1)); V47[i] = _mm256_sub_epi16(V47[i], _mm256_srai_epi16(_mm256_add_epi16(V46[i], V48[i]), 1)); V49[i] = _mm256_sub_epi16(V49[i], _mm256_srai_epi16(_mm256_add_epi16(V48[i], V50[i]), 1)); V51[i] = _mm256_sub_epi16(V51[i], _mm256_srai_epi16(_mm256_add_epi16(V50[i], V52[i]), 1)); V53[i] = _mm256_sub_epi16(V53[i], _mm256_srai_epi16(_mm256_add_epi16(V52[i], V54[i]), 1)); V55[i] = _mm256_sub_epi16(V55[i], _mm256_srai_epi16(_mm256_add_epi16(V54[i], V56[i]), 1)); V57[i] = _mm256_sub_epi16(V57[i], _mm256_srai_epi16(_mm256_add_epi16(V56[i], V58[i]), 
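/*
 * Lifting steps of the wavelet, per the pExt[] comments in this function.
 * A scalar sketch of what the two loops compute for one row p[0..63]
 * (the names p and x are illustrative, not from the original source):
 *
 *   for (x = 1; x < 64; x += 2)            // predict: odd columns
 *       p[x] -= (p[x - 1] + (x + 1 < 64 ? p[x + 1] : p[x - 1])) >> 1;
 *   for (x = 0; x < 64; x += 2)            // update: even columns
 *       p[x] += ((x > 0 ? p[x - 1] : p[x + 1]) + p[x + 1] + 2) >> 2;
 *
 * The mirrored neighbours at the block borders explain the (V62 + V62)
 * and (V01 + V01) terms used for V63 and V00.
 */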
1)); V59[i] = _mm256_sub_epi16(V59[i], _mm256_srai_epi16(_mm256_add_epi16(V58[i], V60[i]), 1)); V61[i] = _mm256_sub_epi16(V61[i], _mm256_srai_epi16(_mm256_add_epi16(V60[i], V62[i]), 1)); V63[i] = _mm256_sub_epi16(V63[i], _mm256_srai_epi16(_mm256_add_epi16(V62[i], V62[i]), 1)); } //pExt[x] += (pExt[x - 1] + pExt[x + 1] + 2) >> 2; for (i = 0; i < 4; i++){ V00[i] = _mm256_add_epi16(V00[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V01[i], V01[i]), mAddOffset2), 2)); V02[i] = _mm256_add_epi16(V02[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V01[i], V03[i]), mAddOffset2), 2)); V04[i] = _mm256_add_epi16(V04[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V03[i], V05[i]), mAddOffset2), 2)); V06[i] = _mm256_add_epi16(V06[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V05[i], V07[i]), mAddOffset2), 2)); V08[i] = _mm256_add_epi16(V08[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V07[i], V09[i]), mAddOffset2), 2)); V10[i] = _mm256_add_epi16(V10[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V09[i], V11[i]), mAddOffset2), 2)); V12[i] = _mm256_add_epi16(V12[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V11[i], V13[i]), mAddOffset2), 2)); V14[i] = _mm256_add_epi16(V14[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V13[i], V15[i]), mAddOffset2), 2)); V16[i] = _mm256_add_epi16(V16[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V15[i], V17[i]), mAddOffset2), 2)); V18[i] = _mm256_add_epi16(V18[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V17[i], V19[i]), mAddOffset2), 2)); V20[i] = _mm256_add_epi16(V20[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V19[i], V21[i]), mAddOffset2), 2)); V22[i] = _mm256_add_epi16(V22[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V21[i], V23[i]), mAddOffset2), 2)); V24[i] = _mm256_add_epi16(V24[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V23[i], V25[i]), mAddOffset2), 2)); V26[i] = _mm256_add_epi16(V26[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V25[i], V27[i]), mAddOffset2), 2)); V28[i] = _mm256_add_epi16(V28[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V27[i], V29[i]), mAddOffset2), 2)); V30[i] = _mm256_add_epi16(V30[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V29[i], V31[i]), mAddOffset2), 2)); V32[i] = _mm256_add_epi16(V32[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V31[i], V33[i]), mAddOffset2), 2)); V34[i] = _mm256_add_epi16(V34[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V33[i], V35[i]), mAddOffset2), 2)); V36[i] = _mm256_add_epi16(V36[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V35[i], V37[i]), mAddOffset2), 2)); V38[i] = _mm256_add_epi16(V38[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V37[i], V39[i]), mAddOffset2), 2)); V40[i] = _mm256_add_epi16(V40[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V39[i], V41[i]), mAddOffset2), 2)); V42[i] = _mm256_add_epi16(V42[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V41[i], V43[i]), mAddOffset2), 2)); V44[i] = _mm256_add_epi16(V44[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V43[i], V45[i]), mAddOffset2), 2)); V46[i] = _mm256_add_epi16(V46[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V45[i], V47[i]), mAddOffset2), 2)); V48[i] = _mm256_add_epi16(V48[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V47[i], V49[i]), mAddOffset2), 2)); V50[i] = _mm256_add_epi16(V50[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V49[i], V51[i]), mAddOffset2), 2)); V52[i] = 
_mm256_add_epi16(V52[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V51[i], V53[i]), mAddOffset2), 2)); V54[i] = _mm256_add_epi16(V54[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V53[i], V55[i]), mAddOffset2), 2)); V56[i] = _mm256_add_epi16(V56[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V55[i], V57[i]), mAddOffset2), 2)); V58[i] = _mm256_add_epi16(V58[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V57[i], V59[i]), mAddOffset2), 2)); V60[i] = _mm256_add_epi16(V60[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V59[i], V61[i]), mAddOffset2), 2)); V62[i] = _mm256_add_epi16(V62[i], _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(V61[i], V63[i]), mAddOffset2), 2)); } TRANSPOSE_16x16_16BIT(V00[0], V02[0], V04[0], V06[0], V08[0], V10[0], V12[0], V14[0], V16[0], V18[0], V20[0], V22[0], V24[0], V26[0], V28[0], V30[0], A00[0], A01[0], A02[0], A03[0], A04[0], A05[0], A06[0], A07[0], A08[0], A09[0], A10[0], A11[0], A12[0], A13[0], A14[0], A15[0]); TRANSPOSE_16x16_16BIT(V32[0], V34[0], V36[0], V38[0], V40[0], V42[0], V44[0], V46[0], V48[0], V50[0], V52[0], V54[0], V56[0], V58[0], V60[0], V62[0], A00[1], A01[1], A02[1], A03[1], A04[1], A05[1], A06[1], A07[1], A08[1], A09[1], A10[1], A11[1], A12[1], A13[1], A14[1], A15[1]); TRANSPOSE_16x16_16BIT(V00[1], V02[1], V04[1], V06[1], V08[1], V10[1], V12[1], V14[1], V16[1], V18[1], V20[1], V22[1], V24[1], V26[1], V28[1], V30[1], A16[0], A17[0], A18[0], A19[0], A20[0], A21[0], A22[0], A23[0], A24[0], A25[0], A26[0], A27[0], A28[0], A29[0], A30[0], A31[0]); TRANSPOSE_16x16_16BIT(V32[1], V34[1], V36[1], V38[1], V40[1], V42[1], V44[1], V46[1], V48[1], V50[1], V52[1], V54[1], V56[1], V58[1], V60[1], V62[1], A16[1], A17[1], A18[1], A19[1], A20[1], A21[1], A22[1], A23[1], A24[1], A25[1], A26[1], A27[1], A28[1], A29[1], A30[1], A31[1]); TRANSPOSE_16x16_16BIT(V00[2], V02[2], V04[2], V06[2], V08[2], V10[2], V12[2], V14[2], V16[2], V18[2], V20[2], V22[2], V24[2], V26[2], V28[2], V30[2], A32[0], A33[0], A34[0], A35[0], A36[0], A37[0], A38[0], A39[0], A40[0], A41[0], A42[0], A43[0], A44[0], A45[0], A46[0], A47[0]); TRANSPOSE_16x16_16BIT(V32[2], V34[2], V36[2], V38[2], V40[2], V42[2], V44[2], V46[2], V48[2], V50[2], V52[2], V54[2], V56[2], V58[2], V60[2], V62[2], A32[1], A33[1], A34[1], A35[1], A36[1], A37[1], A38[1], A39[1], A40[1], A41[1], A42[1], A43[1], A44[1], A45[1], A46[1], A47[1]); TRANSPOSE_16x16_16BIT(V00[3], V02[3], V04[3], V06[3], V08[3], V10[3], V12[3], V14[3], V16[3], V18[3], V20[3], V22[3], V24[3], V26[3], V28[3], V30[3], A48[0], A49[0], A50[0], A51[0], A52[0], A53[0], A54[0], A55[0], A56[0], A57[0], A58[0], A59[0], A60[0], A61[0], A62[0], A63[0]); TRANSPOSE_16x16_16BIT(V32[3], V34[3], V36[3], V38[3], V40[3], V42[3], V44[3], V46[3], V48[3], V50[3], V52[3], V54[3], V56[3], V58[3], V60[3], V62[3], A48[1], A49[1], A50[1], A51[1], A52[1], A53[1], A54[1], A55[1], A56[1], A57[1], A58[1], A59[1], A60[1], A61[1], A62[1], A63[1]); //pExt[y] -= (pExt[y - 1] + pExt[y + 1]) >> 1; for (i = 0; i < 2; i++){ A01[i] = _mm256_sub_epi16(A01[i], _mm256_srai_epi16(_mm256_add_epi16(A00[i], A02[i]), 1)); A03[i] = _mm256_sub_epi16(A03[i], _mm256_srai_epi16(_mm256_add_epi16(A02[i], A04[i]), 1)); A05[i] = _mm256_sub_epi16(A05[i], _mm256_srai_epi16(_mm256_add_epi16(A04[i], A06[i]), 1)); A07[i] = _mm256_sub_epi16(A07[i], _mm256_srai_epi16(_mm256_add_epi16(A06[i], A08[i]), 1)); A09[i] = _mm256_sub_epi16(A09[i], _mm256_srai_epi16(_mm256_add_epi16(A08[i], A10[i]), 1)); A11[i] = _mm256_sub_epi16(A11[i], 
_mm256_srai_epi16(_mm256_add_epi16(A10[i], A12[i]), 1)); A13[i] = _mm256_sub_epi16(A13[i], _mm256_srai_epi16(_mm256_add_epi16(A12[i], A14[i]), 1)); A15[i] = _mm256_sub_epi16(A15[i], _mm256_srai_epi16(_mm256_add_epi16(A14[i], A16[i]), 1)); A17[i] = _mm256_sub_epi16(A17[i], _mm256_srai_epi16(_mm256_add_epi16(A16[i], A18[i]), 1)); A19[i] = _mm256_sub_epi16(A19[i], _mm256_srai_epi16(_mm256_add_epi16(A18[i], A20[i]), 1)); A21[i] = _mm256_sub_epi16(A21[i], _mm256_srai_epi16(_mm256_add_epi16(A20[i], A22[i]), 1)); A23[i] = _mm256_sub_epi16(A23[i], _mm256_srai_epi16(_mm256_add_epi16(A22[i], A24[i]), 1)); A25[i] = _mm256_sub_epi16(A25[i], _mm256_srai_epi16(_mm256_add_epi16(A24[i], A26[i]), 1)); A27[i] = _mm256_sub_epi16(A27[i], _mm256_srai_epi16(_mm256_add_epi16(A26[i], A28[i]), 1)); A29[i] = _mm256_sub_epi16(A29[i], _mm256_srai_epi16(_mm256_add_epi16(A28[i], A30[i]), 1)); A31[i] = _mm256_sub_epi16(A31[i], _mm256_srai_epi16(_mm256_add_epi16(A30[i], A32[i]), 1)); A33[i] = _mm256_sub_epi16(A33[i], _mm256_srai_epi16(_mm256_add_epi16(A32[i], A34[i]), 1)); A35[i] = _mm256_sub_epi16(A35[i], _mm256_srai_epi16(_mm256_add_epi16(A34[i], A36[i]), 1)); A37[i] = _mm256_sub_epi16(A37[i], _mm256_srai_epi16(_mm256_add_epi16(A36[i], A38[i]), 1)); A39[i] = _mm256_sub_epi16(A39[i], _mm256_srai_epi16(_mm256_add_epi16(A38[i], A40[i]), 1)); A41[i] = _mm256_sub_epi16(A41[i], _mm256_srai_epi16(_mm256_add_epi16(A40[i], A42[i]), 1)); A43[i] = _mm256_sub_epi16(A43[i], _mm256_srai_epi16(_mm256_add_epi16(A42[i], A44[i]), 1)); A45[i] = _mm256_sub_epi16(A45[i], _mm256_srai_epi16(_mm256_add_epi16(A44[i], A46[i]), 1)); A47[i] = _mm256_sub_epi16(A47[i], _mm256_srai_epi16(_mm256_add_epi16(A46[i], A48[i]), 1)); A49[i] = _mm256_sub_epi16(A49[i], _mm256_srai_epi16(_mm256_add_epi16(A48[i], A50[i]), 1)); A51[i] = _mm256_sub_epi16(A51[i], _mm256_srai_epi16(_mm256_add_epi16(A50[i], A52[i]), 1)); A53[i] = _mm256_sub_epi16(A53[i], _mm256_srai_epi16(_mm256_add_epi16(A52[i], A54[i]), 1)); A55[i] = _mm256_sub_epi16(A55[i], _mm256_srai_epi16(_mm256_add_epi16(A54[i], A56[i]), 1)); A57[i] = _mm256_sub_epi16(A57[i], _mm256_srai_epi16(_mm256_add_epi16(A56[i], A58[i]), 1)); A59[i] = _mm256_sub_epi16(A59[i], _mm256_srai_epi16(_mm256_add_epi16(A58[i], A60[i]), 1)); A61[i] = _mm256_sub_epi16(A61[i], _mm256_srai_epi16(_mm256_add_epi16(A60[i], A62[i]), 1)); A63[i] = _mm256_sub_epi16(A63[i], _mm256_srai_epi16(_mm256_add_epi16(A62[i], A62[i]), 1)); } //pExt[y] = (pExt[y] << 1) + ((pExt[y - 1] + pExt[y + 1] + 1) >> 1); for (i = 0; i < 2; i++){ A00[i] = _mm256_add_epi16(_mm256_slli_epi16(A00[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A01[i], A01[i]), mAddOffset1), 1)); A02[i] = _mm256_add_epi16(_mm256_slli_epi16(A02[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A01[i], A03[i]), mAddOffset1), 1)); A04[i] = _mm256_add_epi16(_mm256_slli_epi16(A04[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A03[i], A05[i]), mAddOffset1), 1)); A06[i] = _mm256_add_epi16(_mm256_slli_epi16(A06[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A05[i], A07[i]), mAddOffset1), 1)); A08[i] = _mm256_add_epi16(_mm256_slli_epi16(A08[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A07[i], A09[i]), mAddOffset1), 1)); A10[i] = _mm256_add_epi16(_mm256_slli_epi16(A10[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A09[i], A11[i]), mAddOffset1), 1)); A12[i] = _mm256_add_epi16(_mm256_slli_epi16(A12[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A11[i], A13[i]), mAddOffset1), 1)); A14[i] = 
_mm256_add_epi16(_mm256_slli_epi16(A14[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A13[i], A15[i]), mAddOffset1), 1)); A16[i] = _mm256_add_epi16(_mm256_slli_epi16(A16[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A15[i], A17[i]), mAddOffset1), 1)); A18[i] = _mm256_add_epi16(_mm256_slli_epi16(A18[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A17[i], A19[i]), mAddOffset1), 1)); A20[i] = _mm256_add_epi16(_mm256_slli_epi16(A20[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A19[i], A21[i]), mAddOffset1), 1)); A22[i] = _mm256_add_epi16(_mm256_slli_epi16(A22[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A21[i], A23[i]), mAddOffset1), 1)); A24[i] = _mm256_add_epi16(_mm256_slli_epi16(A24[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A23[i], A25[i]), mAddOffset1), 1)); A26[i] = _mm256_add_epi16(_mm256_slli_epi16(A26[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A25[i], A27[i]), mAddOffset1), 1)); A28[i] = _mm256_add_epi16(_mm256_slli_epi16(A28[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A27[i], A29[i]), mAddOffset1), 1)); A30[i] = _mm256_add_epi16(_mm256_slli_epi16(A30[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A29[i], A31[i]), mAddOffset1), 1)); A32[i] = _mm256_add_epi16(_mm256_slli_epi16(A32[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A31[i], A33[i]), mAddOffset1), 1)); A34[i] = _mm256_add_epi16(_mm256_slli_epi16(A34[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A33[i], A35[i]), mAddOffset1), 1)); A36[i] = _mm256_add_epi16(_mm256_slli_epi16(A36[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A35[i], A37[i]), mAddOffset1), 1)); A38[i] = _mm256_add_epi16(_mm256_slli_epi16(A38[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A37[i], A39[i]), mAddOffset1), 1)); A40[i] = _mm256_add_epi16(_mm256_slli_epi16(A40[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A39[i], A41[i]), mAddOffset1), 1)); A42[i] = _mm256_add_epi16(_mm256_slli_epi16(A42[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A41[i], A43[i]), mAddOffset1), 1)); A44[i] = _mm256_add_epi16(_mm256_slli_epi16(A44[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A43[i], A45[i]), mAddOffset1), 1)); A46[i] = _mm256_add_epi16(_mm256_slli_epi16(A46[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A45[i], A47[i]), mAddOffset1), 1)); A48[i] = _mm256_add_epi16(_mm256_slli_epi16(A48[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A47[i], A49[i]), mAddOffset1), 1)); A50[i] = _mm256_add_epi16(_mm256_slli_epi16(A50[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A49[i], A51[i]), mAddOffset1), 1)); A52[i] = _mm256_add_epi16(_mm256_slli_epi16(A52[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A51[i], A53[i]), mAddOffset1), 1)); A54[i] = _mm256_add_epi16(_mm256_slli_epi16(A54[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A53[i], A55[i]), mAddOffset1), 1)); A56[i] = _mm256_add_epi16(_mm256_slli_epi16(A56[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A55[i], A57[i]), mAddOffset1), 1)); A58[i] = _mm256_add_epi16(_mm256_slli_epi16(A58[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A57[i], A59[i]), mAddOffset1), 1)); A60[i] = _mm256_add_epi16(_mm256_slli_epi16(A60[i], 1), _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A59[i], A61[i]), mAddOffset1), 1)); A62[i] = _mm256_add_epi16(_mm256_slli_epi16(A62[i], 1), 
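/*
 * Vertical direction of the same lifting scheme, now running across the
 * row registers A00..A63 (two __m256i per row cover the 32 remaining
 * low-pass columns).  The odd rows were predicted above; the loop here is
 * the update step, which also doubles the even rows:
 *     A[y] = (A[y] << 1) + ((A[y-1] + A[y+1] + 1) >> 1)
 * matching the pExt[y] comment.  The Store loop that follows keeps only
 * the even (low-pass) rows A00, A02, ..., A62, writing them back to
 * coeff[] as the 32x32 low band that the 32x32 DCT kernel then consumes.
 */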
_mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(A61[i], A63[i]), mAddOffset1), 1)); } //Store for (i = 0; i < 2; i++){ _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 0], A00[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 1], A02[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 2], A04[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 3], A06[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 4], A08[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 5], A10[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 6], A12[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 7], A14[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 8], A16[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 9], A18[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 10], A20[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 11], A22[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 12], A24[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 13], A26[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 14], A28[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 15], A30[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 16], A32[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 17], A34[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 18], A36[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 19], A38[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 20], A40[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 21], A42[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 22], A44[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 23], A46[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 24], A48[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 25], A50[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 26], A52[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 27], A54[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 28], A56[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 29], A58[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 30], A60[i]); _mm256_store_si256((__m256i*)&coeff[16 * i + 32 * 31], A62[i]); } } /* --------------------------------------------------------------------------- */ void dct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_avx2(dst); dct_c_32x32_avx2(dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ void dct_c_64x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x64_avx2(dst); dct_c_32x32_half_avx2(dst, dst, 32 | 1); } /* --------------------------------------------------------------------------- */ void dct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_64x16_avx2(dst); dct_c_32x8_avx2(dst, dst, 32 | 0x01); } /* --------------------------------------------------------------------------- */ void dct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_src) { UNUSED_PARAMETER(src); UNUSED_PARAMETER(i_src); wavelet_16x64_avx2(dst); dct_c_8x32_avx2(dst, dst, 8 | 0x01); } /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct_16_0_avx[][16]) = { { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, // 0 { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 
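/*
 * The tables in this area drive the 16-point DCT kernels below.
 * tab_dct_16_0_avx rows appear to be byte-shuffle controls for
 * _mm256_shuffle_epi8 (each 16-bit constant packs a pair of byte indices)
 * used to reorder coefficients before the butterfly stages, while
 * tab_dct_16_1_avx rows hold the 16-point transform coefficients
 * (45/43/40/35/29/21/13/4 for the odd part, 42/17 and 44/38/25/9 for the
 * even part, 32 for the DC rows), each repeated in both 128-bit lanes so
 * one _mm256_madd_epi16 covers two 8-sample groups at once.
 */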
0x0504, 0x0B0A, 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A }, // 1 { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A }, // 2 { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 } // 3 }; /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct_16_1_avx[][16]) = { { 45, 43, 40, 35, 29, 21, 13, 4, 45, 43, 40, 35, 29, 21, 13, 4 }, // 0 { 43, 29, 4, -21, -40, -45, -35, -13, 43, 29, 4, -21, -40, -45, -35, -13 }, // 1 { 40, 4, -35, -43, -13, 29, 45, 21, 40, 4, -35, -43, -13, 29, 45, 21 }, // 2 { 35, -21, -43, 4, 45, 13, -40, -29, 35, -21, -43, 4, 45, 13, -40, -29 }, // 3 { 29, -40, -13, 45, -4, -43, 21, 35, 29, -40, -13, 45, -4, -43, 21, 35 }, // 4 { 21, -45, 29, 13, -43, 35, 4, -40, 21, -45, 29, 13, -43, 35, 4, -40 }, // 5 { 13, -35, 45, -40, 21, 4, -29, 43, 13, -35, 45, -40, 21, 4, -29, 43 }, // 6 { 4, -13, 21, -29, 35, -40, 43, -45, 4, -13, 21, -29, 35, -40, 43, -45 }, // 7 { 42, 42, -42, -42, 17, 17, -17, -17, 42, 42, -42, -42, 17, 17, -17, -17 }, // 8 { 17, 17, -17, -17, -42, -42, 42, 42, 17, 17, -17, -17, -42, -42, 42, 42 }, // 9 { 44, 44, 9, 9, 38, 38, 25, 25, 44, 44, 9, 9, 38, 38, 25, 25 }, // 10 { 38, 38, -25, -25, -9, -9, -44, -44, 38, 38, -25, -25, -9, -9, -44, -44 }, // 11 { 25, 25, 38, 38, -44, -44, 9, 9, 25, 25, 38, 38, -44, -44, 9, 9 }, // 12 { 9, 9, -44, -44, -25, -25, 38, 38, 9, 9, -44, -44, -25, -25, 38, 38 }, // 13 /* --------------------------------------------------------------------------- */ #define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \ { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2), (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) }, \ { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5), (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) }, MAKE_COEF(45, 43, 40, 35, 29, 21, 13, 4) MAKE_COEF(43, 29, 4, -21, -40, -45, -35, -13) MAKE_COEF(40, 4, -35, -43, -13, 29, 45, 21) MAKE_COEF(35, -21, -43, 4, 45, 13, -40, -29) MAKE_COEF(29, -40, -13, 45, -4, -43, 21, 35) MAKE_COEF(21, -45, 29, 13, -43, 35, 4, -40) MAKE_COEF(13, -35, 45, -40, 21, 4, -29, 43) MAKE_COEF(4, -13, 21, -29, 35, -40, 43, -45) #undef MAKE_COEF { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, 32, 32, 32, -32, -32, -32, -32, 32, 32, 32, 32, -32, -32, -32, -32 }, }; /* --------------------------------------------------------------------------- */ #define pair256_set_epi16(a, b) \ _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a) /* --------------------------------------------------------------------------- */ void dct_c_4x16_avx2(const coeff_t *src, coeff_t *dst, int i_src) { const __m256i k_p32_p32 = _mm256_set1_epi16(32); const __m256i k_p32_m32 = pair256_set_epi16(32, -32); const __m256i k_p42_p17 = pair256_set_epi16(42, 17); const __m256i k_p17_m42 = pair256_set_epi16(17, -42); __m256i in[16]; __m256i tr00, tr01; __m256i r0, r1, r4, r5, t0, t2, u0, u1, u2, u3; __m256i T00A, T01A, T00B, T01B; __m256i T10, T11, T12, T13; __m256i T20, T21, T22, T23, T24, T25, T26, T27; __m256i T30, T31, T32, T33; __m256i T40, T41; __m256i T70; int shift2 = 9; const __m256i c_256 = _mm256_set1_epi32(256); __m256i tab_dct_16_02 = _mm256_loadu_si256((__m256i*)tab_dct_16_0_avx[2]); __m256i tab_dct_16_03 = _mm256_loadu_si256((__m256i*)tab_dct_16_0_avx[3]); __m256i tab_dct_16_8 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[8]); __m256i 
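/*
 * dct_c_4x16_avx2: shift2 = 9, so c_256 = 1 << (shift2 - 1) is the
 * rounding offset added before the arithmetic right shift.  The
 * tab_dct_16_xx locals here simply preload rows 8..31 of tab_dct_16_1_avx
 * once, so the madd stages of the second (16-point) pass can reuse them.
 */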
tab_dct_16_9 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[9]); __m256i tab_dct_16_10 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[10]); __m256i tab_dct_16_11 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[11]); __m256i tab_dct_16_12 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[12]); __m256i tab_dct_16_13 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[13]); __m256i tab_dct_16_14 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[14]); __m256i tab_dct_16_15 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[15]); __m256i tab_dct_16_16 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[16]); __m256i tab_dct_16_17 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[17]); __m256i tab_dct_16_18 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[18]); __m256i tab_dct_16_19 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[19]); __m256i tab_dct_16_20 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[20]); __m256i tab_dct_16_21 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[21]); __m256i tab_dct_16_22 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[22]); __m256i tab_dct_16_23 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[23]); __m256i tab_dct_16_24 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[24]); __m256i tab_dct_16_25 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[25]); __m256i tab_dct_16_26 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[26]); __m256i tab_dct_16_27 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[27]); __m256i tab_dct_16_28 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[28]); __m256i tab_dct_16_29 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[29]); __m256i tab_dct_16_30 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[30]); __m256i tab_dct_16_31 = _mm256_loadu_si256((__m256i*)tab_dct_16_1_avx[31]); ///// DCT1 4x16->16x4////// //transpose input data in[0] = _mm256_load_si256((const __m256i *)(src + 0 * i_src)); in[4] = _mm256_load_si256((const __m256i *)(src + 4 * i_src)); in[8] = _mm256_load_si256((const __m256i *)(src + 8 * i_src)); in[12] = _mm256_load_si256((const __m256i *)(src + 12 * i_src)); tr00 = _mm256_shuffle_epi32(in[0], 0xD8);//00 01 04 05 02 03 06 07 / 08 09 12 13 10 11 14 15 tr01 = _mm256_shuffle_epi32(in[4], 0xD8); r0 = _mm256_shufflehi_epi16(tr00, 0xB1);//00 01 04 05 03 02 07 06 / 08 09 12 13 11 10 15 14 r4 = _mm256_shufflehi_epi16(tr01, 0xB1); r1 = _mm256_unpacklo_epi64(r0, r4);//0, 1, 4, 5, 16, 17, 20, 21, 8, 9, 12, 13, 24, 25, 28, 29 r5 = _mm256_unpackhi_epi64(r0, r4);//3, 2, 7, 6, 19, 18, 23, 22, 11, 10, 15, 14, 27, 26, 31, 30 t0 = _mm256_add_epi16(r1, r5);//3, 3, 11, 11, 35, 35, 43, 43, 19, 19, 27, 27, 51, 51, 59, 59 t2 = _mm256_sub_epi16(r1, r5);//-3, -1, -3, -1, -3, -1, -3, -1, -3, -1, -3, -1, -3, -1, -3, -1 u0 = _mm256_madd_epi16(t0, k_p32_p32); u2 = _mm256_madd_epi16(t0, k_p32_m32); u1 = _mm256_madd_epi16(t2, k_p42_p17); u3 = _mm256_madd_epi16(t2, k_p17_m42); T00A = _mm256_packs_epi32(u0, u1); T00B = _mm256_packs_epi32(u2, u3); T00A = _mm256_permute4x64_epi64(T00A, 0xD8); T00A = _mm256_shuffle_epi32(T00A, 0xD8);//out[0] out[4] out[1] out[5] T00B = _mm256_permute4x64_epi64(T00B, 0xD8); T00B = _mm256_shuffle_epi32(T00B, 0xD8);//out[2] out[6] out[3] out[7] tr00 = _mm256_shuffle_epi32(in[8], 0xD8);//00 01 04 05 02 03 06 07 / 08 09 12 13 10 11 14 15 tr01 = _mm256_shuffle_epi32(in[12], 0xD8); r0 = _mm256_shufflehi_epi16(tr00, 0xB1);//00 01 04 05 03 02 07 06 / 08 09 12 13 11 10 15 14 r4 = _mm256_shufflehi_epi16(tr01, 0xB1); r1 = _mm256_unpacklo_epi64(r0, r4); r5 = _mm256_unpackhi_epi64(r0, r4); t0 = _mm256_add_epi16(r1, r5); t2 = _mm256_sub_epi16(r1, r5); u0 = 
_mm256_madd_epi16(t0, k_p32_p32); u2 = _mm256_madd_epi16(t0, k_p32_m32); u1 = _mm256_madd_epi16(t2, k_p42_p17); u3 = _mm256_madd_epi16(t2, k_p17_m42); T01A = _mm256_packs_epi32(u0, u1); T01B = _mm256_packs_epi32(u2, u3); T01A = _mm256_permute4x64_epi64(T01A, 0xD8); T01A = _mm256_shuffle_epi32(T01A, 0xD8);//out[8] out[12] out[9] out[13] T01B = _mm256_permute4x64_epi64(T01B, 0xD8); T01B = _mm256_shuffle_epi32(T01B, 0xD8);//out[10] out[14] out[11] out[15] T00A = _mm256_shuffle_epi8(T00A, tab_dct_16_02); T01A = _mm256_shuffle_epi8(T01A, tab_dct_16_03); T00B = _mm256_shuffle_epi8(T00B, tab_dct_16_02); T01B = _mm256_shuffle_epi8(T01B, tab_dct_16_03); T10 = _mm256_unpacklo_epi16(T00A, T01A);//T10 T12 T11 = _mm256_unpackhi_epi16(T00A, T01A);//T11 T13 T12 = _mm256_unpacklo_epi16(T00B, T01B);//T14 T16 T13 = _mm256_unpackhi_epi16(T00B, T01B);//T15 T17 T20 = _mm256_madd_epi16(T10, tab_dct_16_30); T21 = _mm256_madd_epi16(T11, tab_dct_16_30); T22 = _mm256_madd_epi16(T12, tab_dct_16_30); T23 = _mm256_madd_epi16(T13, tab_dct_16_30); T24 = _mm256_madd_epi16(T10, tab_dct_16_10); T25 = _mm256_madd_epi16(T11, tab_dct_16_10); T26 = _mm256_madd_epi16(T12, tab_dct_16_10); T27 = _mm256_madd_epi16(T13, tab_dct_16_10); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_sub_epi32(T24, T25); T33 = _mm256_sub_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T40 = _mm256_hadd_epi32(T30, T32); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_256), shift2); T40 = _mm256_permute4x64_epi64(T40, 0xD8);//0 2 T20 = _mm256_madd_epi16(T10, tab_dct_16_14); T21 = _mm256_madd_epi16(T11, tab_dct_16_15); T22 = _mm256_madd_epi16(T12, tab_dct_16_14); T23 = _mm256_madd_epi16(T13, tab_dct_16_15); T24 = _mm256_madd_epi16(T10, tab_dct_16_16); T25 = _mm256_madd_epi16(T11, tab_dct_16_17); T26 = _mm256_madd_epi16(T12, tab_dct_16_16); T27 = _mm256_madd_epi16(T13, tab_dct_16_17); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_add_epi32(T24, T25); T33 = _mm256_add_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T41 = _mm256_hadd_epi32(T30, T32); T41 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_256), shift2); T41 = _mm256_permute4x64_epi64(T41, 0xD8);//1 3 T70 = _mm256_packs_epi32(T40, T41); T70 = _mm256_shufflehi_epi16(T70, 0xD8); T70 = _mm256_shufflelo_epi16(T70, 0xD8); _mm256_storeu2_m128i((__m128i*)(dst + 2 * 4), (__m128i*)(dst + 0 * 4), T70); T20 = _mm256_madd_epi16(T10, tab_dct_16_8); T21 = _mm256_madd_epi16(T11, tab_dct_16_8); T22 = _mm256_madd_epi16(T12, tab_dct_16_8); T23 = _mm256_madd_epi16(T13, tab_dct_16_8); T24 = _mm256_madd_epi16(T10, tab_dct_16_11); T25 = _mm256_madd_epi16(T11, tab_dct_16_11); T26 = _mm256_madd_epi16(T12, tab_dct_16_11); T27 = _mm256_madd_epi16(T13, tab_dct_16_11); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_sub_epi32(T24, T25); T33 = _mm256_sub_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T40 = _mm256_hadd_epi32(T30, T32); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_256), shift2); T40 = _mm256_permute4x64_epi64(T40, 0xD8);//4 6 T20 = _mm256_madd_epi16(T10, tab_dct_16_18); T21 = _mm256_madd_epi16(T11, tab_dct_16_19); T22 = _mm256_madd_epi16(T12, tab_dct_16_18); T23 = _mm256_madd_epi16(T13, tab_dct_16_19); T24 = _mm256_madd_epi16(T10, tab_dct_16_20); T25 = _mm256_madd_epi16(T11, tab_dct_16_21); T26 = _mm256_madd_epi16(T12, tab_dct_16_20); T27 = _mm256_madd_epi16(T13, tab_dct_16_21); T30 = 
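/*
 * Second pass of dct_c_4x16_avx2 (the 16-point direction).  Each group of
 * madd/hadd reductions like the one here produces four of the sixteen
 * 4-coefficient output rows: the products are summed, rounded with c_256,
 * shifted right by shift2 = 9, packed back to 16 bit and written with one
 * _mm256_storeu2_m128i per group (dst rows 0-3, 4-7, 8-11 and 12-15).
 */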
_mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_add_epi32(T24, T25); T33 = _mm256_add_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T41 = _mm256_hadd_epi32(T30, T32); T41 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_256), shift2); T41 = _mm256_permute4x64_epi64(T41, 0xD8);//5 7 T70 = _mm256_packs_epi32(T40, T41); T70 = _mm256_shufflehi_epi16(T70, 0xD8); T70 = _mm256_shufflelo_epi16(T70, 0xD8); _mm256_storeu2_m128i((__m128i*)(dst + 6 * 4), (__m128i*)(dst + 4 * 4), T70); T20 = _mm256_madd_epi16(T10, tab_dct_16_31); T21 = _mm256_madd_epi16(T11, tab_dct_16_31); T22 = _mm256_madd_epi16(T12, tab_dct_16_31); T23 = _mm256_madd_epi16(T13, tab_dct_16_31); T24 = _mm256_madd_epi16(T10, tab_dct_16_12); T25 = _mm256_madd_epi16(T11, tab_dct_16_12); T26 = _mm256_madd_epi16(T12, tab_dct_16_12); T27 = _mm256_madd_epi16(T13, tab_dct_16_12); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_sub_epi32(T24, T25); T33 = _mm256_sub_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T40 = _mm256_hadd_epi32(T30, T32); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_256), shift2); T40 = _mm256_permute4x64_epi64(T40, 0xD8);//8 10 T20 = _mm256_madd_epi16(T10, tab_dct_16_22); T21 = _mm256_madd_epi16(T11, tab_dct_16_23); T22 = _mm256_madd_epi16(T12, tab_dct_16_22); T23 = _mm256_madd_epi16(T13, tab_dct_16_23); T24 = _mm256_madd_epi16(T10, tab_dct_16_24); T25 = _mm256_madd_epi16(T11, tab_dct_16_25); T26 = _mm256_madd_epi16(T12, tab_dct_16_24); T27 = _mm256_madd_epi16(T13, tab_dct_16_25); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_add_epi32(T24, T25); T33 = _mm256_add_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T41 = _mm256_hadd_epi32(T30, T32); T41 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_256), shift2); T41 = _mm256_permute4x64_epi64(T41, 0xD8);//9 11 T70 = _mm256_packs_epi32(T40, T41); T70 = _mm256_shufflehi_epi16(T70, 0xD8); T70 = _mm256_shufflelo_epi16(T70, 0xD8); _mm256_storeu2_m128i((__m128i*)(dst + 10 * 4), (__m128i*)(dst + 8 * 4), T70); T20 = _mm256_madd_epi16(T10, tab_dct_16_9); T21 = _mm256_madd_epi16(T11, tab_dct_16_9); T22 = _mm256_madd_epi16(T12, tab_dct_16_9); T23 = _mm256_madd_epi16(T13, tab_dct_16_9); T24 = _mm256_madd_epi16(T10, tab_dct_16_13); T25 = _mm256_madd_epi16(T11, tab_dct_16_13); T26 = _mm256_madd_epi16(T12, tab_dct_16_13); T27 = _mm256_madd_epi16(T13, tab_dct_16_13); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_sub_epi32(T24, T25); T33 = _mm256_sub_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T40 = _mm256_hadd_epi32(T30, T32); T40 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_256), shift2); T40 = _mm256_permute4x64_epi64(T40, 0xD8);//12 14 T20 = _mm256_madd_epi16(T10, tab_dct_16_26); T21 = _mm256_madd_epi16(T11, tab_dct_16_27); T22 = _mm256_madd_epi16(T12, tab_dct_16_26); T23 = _mm256_madd_epi16(T13, tab_dct_16_27); T24 = _mm256_madd_epi16(T10, tab_dct_16_28); T25 = _mm256_madd_epi16(T11, tab_dct_16_29); T26 = _mm256_madd_epi16(T12, tab_dct_16_28); T27 = _mm256_madd_epi16(T13, tab_dct_16_29); T30 = _mm256_add_epi32(T20, T21); T31 = _mm256_add_epi32(T22, T23); T32 = _mm256_add_epi32(T24, T25); T33 = _mm256_add_epi32(T26, T27); T30 = _mm256_hadd_epi32(T30, T31); T32 = _mm256_hadd_epi32(T32, T33); T41 = _mm256_hadd_epi32(T30, T32); T41 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_256), shift2); T41 = 
_mm256_permute4x64_epi64(T41, 0xD8);//13 15 T70 = _mm256_packs_epi32(T40, T41); T70 = _mm256_shufflehi_epi16(T70, 0xD8); T70 = _mm256_shufflelo_epi16(T70, 0xD8); _mm256_storeu2_m128i((__m128i*)(dst + 14 * 4), (__m128i*)(dst + 12 * 4), T70); } /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct_16x4_avx2[][16]) = { { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 }, { 32, 32, 32, 32, 32, 32, 32, 32, 32, -32, 32, -32, 32, -32, 32, -32 },//0 8 { 42, 17, 42, 17, 42, 17, 42, 17, 17, -42, 17, -42, 17, -42, 17, -42 },//4 12 { 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25, 44, 9, 38, 25 },//2 { 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44, 38, -25, -9, -44 },//6 { 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9, 25, 38, -44, 9 },//10 { 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38, 9, -44, -25, 38 } //14 }; /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct_16x4_1_avx2[][16]) = { { 45, 43, 40, 35, 29, 21, 13, 4, 45, 43, 40, 35, 29, 21, 13, 4 },// 0 { 43, 29, 4, -21, -40, -45, -35, -13, 43, 29, 4, -21, -40, -45, -35, -13 },// 1 { 40, 4, -35, -43, -13, 29, 45, 21, 40, 4, -35, -43, -13, 29, 45, 21 },// 2 { 35, -21, -43, 4, 45, 13, -40, -29, 35, -21, -43, 4, 45, 13, -40, -29 },// 3 { 29, -40, -13, 45, -4, -43, 21, 35, 29, -40, -13, 45, -4, -43, 21, 35 },// 4 { 21, -45, 29, 13, -43, 35, 4, -40, 21, -45, 29, 13, -43, 35, 4, -40 },// 5 { 13, -35, 45, -40, 21, 4, -29, 43, 13, -35, 45, -40, 21, 4, -29, 43 },// 6 { 4, -13, 21, -29, 35, -40, 43, -45, 4, -13, 21, -29, 35, -40, 43, -45 } // 7 }; /* --------------------------------------------------------------------------- */ ALIGN32(static const int16_t tab_dct1_4_avx2[][16]) = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 42, 17, -17, -42, 42, 17, -17, -42, 42, 17, -17, -42, 42, 17, -17, -42 }, { 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32 }, { 17, -42, 42, -17, 17, -42, 42, -17, 17, -42, 42, -17, 17, -42, 42, -17 } }; /* --------------------------------------------------------------------------- */ void dct_c_16x4_avx2(const coeff_t * src, coeff_t * dst, int i_src) { int shift1 = B16X16_IN_BIT + FACTO_BIT +g_bit_depth + 1 - LIMIT_BIT; int shift2 = B16X16_IN_BIT + FACTO_BIT - 2; const int ADD1 = (1 << shift1) >> 1; const int ADD2 = (1 << shift2) >> 1; const __m256i c_2 = _mm256_set1_epi32(ADD1); // TODO: shift1 = 2 const __m256i k_ROUNDING2 = _mm256_set1_epi32(ADD2); __m256i T00, T01, T02, T03, T04, T05, T06, T07; __m256i T10, T11; __m256i T20, T21; __m256i T30, T31; __m256i T40, T41; __m256i T50, T51; __m256i T60, T61; __m256i im1[2]; __m256i temp[4]; __m256i r0, r1, u0, u1, u2, u3, v0, v1, v2, v3, w0, w1, w2, w3; __m256i res0, res1, res2, res3; __m256i d0, d1, d2, d3; __m256i im[4]; //////// DCT1 16x4->4x16 /////// //input data T00 = _mm256_loadu2_m128i((__m128i *)(src + 2 * i_src + 0), (__m128i *)(src + 0 * i_src + 0)); // [00 01 02 03 04 05 06 07 20 21 22 23 24 25 26 27] T01 = _mm256_loadu2_m128i((__m128i *)(src + 2 * i_src + 8), (__m128i *)(src + 0 * i_src + 8)); // [08 09 0A 0B 0C 0D 0E 0F 28 29 2A 2B 2C 2D 2E 2F] T02 = _mm256_loadu2_m128i((__m128i *)(src + 3 * i_src + 0), (__m128i *)(src + 1 * i_src + 0)); T03 = _mm256_loadu2_m128i((__m128i *)(src + 3 * i_src + 8), (__m128i *)(src + 1 * i_src + 8)); //shuffle T04 = _mm256_shuffle_epi8(T01, _mm256_load_si256((__m256i 
*)tab_dct_16x4_avx2[0])); T05 = _mm256_shuffle_epi8(T03, _mm256_load_si256((__m256i *)tab_dct_16x4_avx2[0])); T06 = _mm256_sub_epi16(T00, T04); T07 = _mm256_sub_epi16(T02, T05); T00 = _mm256_shuffle_epi8(T00, _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[0])); // [00 07 03 04 01 06 02 05 20 27 23 24 21 26 22 25] T02 = _mm256_shuffle_epi8(T02, _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[0])); // [0F 08 0B 0C 0E 09 0D 0A 2F 28 2B 2C 2E 29 2D 2A] T01 = _mm256_shuffle_epi8(T01, _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[1])); T03 = _mm256_shuffle_epi8(T03, _mm256_load_si256((__m256i *)tab_dct_16_shuffle_avx2[1])); T10 = _mm256_add_epi16(T00, T01); // [00 07 03 04 01 06 02 05 20 27 23 24 21 26 22 25] T11 = _mm256_add_epi16(T02, T03); T20 = _mm256_hadd_epi16(T10, T11);// [00 03 01 02 10 13 11 12 20 23 21 22 30 33 31 32] T21 = _mm256_hsub_epi16(T10, T11); T30 = _mm256_hadd_epi16(T20, T20);// [00 01 10 11 00 01 10 11 20 21 30 31 20 21 30 31] T31 = _mm256_hsub_epi16(T20, T20); T30 = _mm256_permute4x64_epi64(T30, 0xd8); T31 = _mm256_permute4x64_epi64(T31, 0xd8); T40 = _mm256_madd_epi16(T30, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[1]));//0/8 T41 = _mm256_madd_epi16(T31, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[2]));//4/12 T50 = _mm256_srai_epi32(_mm256_add_epi32(T40, c_2), shift1); T51 = _mm256_srai_epi32(_mm256_add_epi32(T41, c_2), shift1); T60 = _mm256_packs_epi32(T50, T51);//0 4 8 12 T60 = _mm256_permute4x64_epi64(T60, 0xd8);//0 8 4 12 im[0] = T60; T40 = _mm256_madd_epi16(T21, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[3]));//2 T41 = _mm256_madd_epi16(T21, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[4]));///6 T50 = _mm256_hadd_epi32(T40, T41); T50 = _mm256_permute4x64_epi64(T50, 0xd8); T40 = _mm256_madd_epi16(T21, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[5]));//10 T41 = _mm256_madd_epi16(T21, _mm256_load_si256((__m256i*)tab_dct_16x4_avx2[6]));//14 T51 = _mm256_hadd_epi32(T40, T41); T51 = _mm256_permute4x64_epi64(T51, 0xd8); T50 = _mm256_srai_epi32(_mm256_add_epi32(T50, c_2), shift1); T51 = _mm256_srai_epi32(_mm256_add_epi32(T51, c_2), shift1); T60 = _mm256_packs_epi32(T50, T51);//2 10 6 14 im[1] = T60; T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(0)]));//1 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(0)])); T50 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(1)]));//3 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(1)])); T51 = _mm256_hadd_epi32(T40, T41); T60 = _mm256_hadd_epi32(T50, T51); T60 = _mm256_permute4x64_epi64(T60, 0xd8); T60 = _mm256_srai_epi32(_mm256_add_epi32(T60, c_2), shift1); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(2)]));//9 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(2)])); T50 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(3)]));//11 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(3)])); T51 = _mm256_hadd_epi32(T40, T41); T61 = _mm256_hadd_epi32(T50, T51); T61 = _mm256_permute4x64_epi64(T61, 0xd8); T61 = _mm256_srai_epi32(_mm256_add_epi32(T61, c_2), shift1); T60 = _mm256_packs_epi32(T60, T61);//1 5 3 7 T60 = _mm256_permute4x64_epi64(T60, 0xd8);//1 3 5 7 im[2] = T60; T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(4)]));//9 T41 = _mm256_madd_epi16(T07, 
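/*
 * Odd-frequency outputs of the 16-point pass in dct_c_16x4_avx2: T06/T07
 * appear to hold the differences src[i] - src[15 - i] for each of the four
 * input rows, and the rows of tab_dct_16x4_1_avx2 are the odd basis
 * vectors of the transform, so the madd/hadd reductions here accumulate
 * those products into one odd coefficient per input row before the
 * rounding and shift by shift1.
 */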
_mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(4)])); T50 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(5)]));//11 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(5)])); T51 = _mm256_hadd_epi32(T40, T41); T60 = _mm256_hadd_epi32(T50, T51); T60 = _mm256_permute4x64_epi64(T60, 0xd8); T60 = _mm256_srai_epi32(_mm256_add_epi32(T60, c_2), shift1); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(6)]));//13 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(6)])); T50 = _mm256_hadd_epi32(T40, T41); T40 = _mm256_madd_epi16(T06, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(7)]));//15 T41 = _mm256_madd_epi16(T07, _mm256_load_si256((__m256i*)tab_dct_16x4_1_avx2[(7)])); T51 = _mm256_hadd_epi32(T40, T41); T61 = _mm256_hadd_epi32(T50, T51); T61 = _mm256_permute4x64_epi64(T61, 0xd8); T61 = _mm256_srai_epi32(_mm256_add_epi32(T61, c_2), shift1); T60 = _mm256_packs_epi32(T60, T61);//9 13 11 15 T60 = _mm256_permute4x64_epi64(T60, 0xd8);//9 11 13 15 im[3] = T60; im1[0] = _mm256_unpacklo_epi64(im[0], im[1]);//0 2 4 6 im1[1] = _mm256_unpackhi_epi64(im[0], im[1]);//8 10 12 14 temp[0] = _mm256_unpacklo_epi64(im1[0], im[2]);//0 1 4 5 temp[1] = _mm256_unpackhi_epi64(im1[0], im[2]);//2 3 6 7 temp[2] = _mm256_unpacklo_epi64(im1[1], im[3]);//8 9 12 13 temp[3] = _mm256_unpackhi_epi64(im1[1], im[3]);//10 11 14 15 //////// DCT2 16x4->4x16 /////// //1st 4x4 r0 = _mm256_madd_epi16(temp[0], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[0])); r1 = _mm256_madd_epi16(temp[1], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[0])); u0 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[0], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[1])); r1 = _mm256_madd_epi16(temp[1], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[1])); u1 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[0], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[2])); r1 = _mm256_madd_epi16(temp[1], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[2])); u2 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[0], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[3])); r1 = _mm256_madd_epi16(temp[1], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[3])); u3 = _mm256_hadd_epi32(r0, r1); v0 = _mm256_add_epi32(u0, k_ROUNDING2); v1 = _mm256_add_epi32(u1, k_ROUNDING2); v2 = _mm256_add_epi32(u2, k_ROUNDING2); v3 = _mm256_add_epi32(u3, k_ROUNDING2); w0 = _mm256_srai_epi32(v0, shift2); w1 = _mm256_srai_epi32(v1, shift2); w2 = _mm256_srai_epi32(v2, shift2); w3 = _mm256_srai_epi32(v3, shift2); res0 = _mm256_packs_epi32(w0, w1);//0 2 res1 = _mm256_packs_epi32(w2, w3);//1 3 //2st 4x4 r0 = _mm256_madd_epi16(temp[2], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[0])); r1 = _mm256_madd_epi16(temp[3], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[0])); u0 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[2], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[1])); r1 = _mm256_madd_epi16(temp[3], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[1])); u1 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[2], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[2])); r1 = _mm256_madd_epi16(temp[3], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[2])); u2 = _mm256_hadd_epi32(r0, r1); r0 = _mm256_madd_epi16(temp[2], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[3])); r1 = _mm256_madd_epi16(temp[3], _mm256_load_si256((__m256i*)tab_dct1_4_avx2[3])); u3 = _mm256_hadd_epi32(r0, r1); v0 = _mm256_add_epi32(u0, k_ROUNDING2); v1 = _mm256_add_epi32(u1, k_ROUNDING2); v2 = 
_mm256_add_epi32(u2, k_ROUNDING2); v3 = _mm256_add_epi32(u3, k_ROUNDING2); w0 = _mm256_srai_epi32(v0, shift2); w1 = _mm256_srai_epi32(v1, shift2); w2 = _mm256_srai_epi32(v2, shift2); w3 = _mm256_srai_epi32(v3, shift2); res2 = _mm256_packs_epi32(w0, w1);//4 6 res3 = _mm256_packs_epi32(w2, w3);//5 7 res0 = _mm256_permute4x64_epi64(res0, 0xd8); res1 = _mm256_permute4x64_epi64(res1, 0xd8); res2 = _mm256_permute4x64_epi64(res2, 0xd8); res3 = _mm256_permute4x64_epi64(res3, 0xd8); d0 = _mm256_permute2x128_si256(res0, res2, 0x20); d1 = _mm256_permute2x128_si256(res0, res2, 0x31); d2 = _mm256_permute2x128_si256(res1, res3, 0x20); d3 = _mm256_permute2x128_si256(res1, res3, 0x31); //store _mm256_store_si256((__m256i *)(dst + 0), d0); _mm256_store_si256((__m256i *)(dst + 16), d1); _mm256_store_si256((__m256i *)(dst + 32), d2); _mm256_store_si256((__m256i *)(dst + 48), d3); } xavs2-1.3/source/common/vec/intrinsic_deblock.c000066400000000000000000000754021340660520300216250ustar00rootroot00000000000000/* * intrinsic_deblock.c * * Description of this file: * SSE assembly functions of Deblock module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../basic_types.h" #include "intrinsic.h" #include #include #include #include void deblock_edge_ver_sse128(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp = SrcPtr - 4; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
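/*
 * flt_flag[0] / flt_flag[1] are the per-4-pixel-edge "filter this edge"
 * decisions; they are widened here to all-ones / all-zeros words (flag0,
 * flag1) so they can be folded straight into the SIMD mask M0 below
 * (_mm_set_epi32(flag1, flag1, flag0, flag0)), which gates every later
 * _mm_blendv_epi8 write-back.
 */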
-1 : 0; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i TL0l, TL1l, TL2l; __m128i TR0l, TR1l, TR2l; __m128i V0, V1, V2, V3, V4, V5; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS3, FS4, FS56; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); __m128i c_16 = _mm_set1_epi16(16); T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); T4 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 4)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 5)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 6)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 7)); T0 = _mm_unpacklo_epi8(T0, T1); T1 = _mm_unpacklo_epi8(T2, T3); T2 = _mm_unpacklo_epi8(T4, T5); T3 = _mm_unpacklo_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_sub_epi16(FLT, c_2); T1 = _mm_sub_epi16(FLT, c_3); T2 = _mm_subabs_epu16(TL1, TR1); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2)); FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 TL0l = TL0; TL1l = TL1; TR0l = TR0; TR1l = TR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0l, TR0l), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0l, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0l, 1), T2), 2); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 1), _mm_add_epi16(TL1l, TR0l)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 3), _mm_add_epi16(T0, T2)); V0 = 
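/*
 * Luma edge filter: FS (0..4) has been derived for each of the eight
 * lines crossing the edge from |p0 - q0| against Alpha and from the Beta
 * gradient tests (FLT_L/FLT_R).  Each "fs == n" block computes the
 * filtered p0/q0 (plus p1/q1 and p2/q2 for the stronger modes) and merges
 * the result in only where FS equals that strength, via _mm_cmpeq_epi16 +
 * _mm_blendv_epi8; the fs == 4 branch is skipped entirely when
 * _mm_testz_si128 reports that no line needs it.
 */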
_mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 1), _mm_add_epi16(TR1l, TL0l)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 2), _mm_add_epi16(TL2, TR1l)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 1), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(T0, 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 2), _mm_add_epi16(TR2, TL1l)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 1), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(T0, 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3)); T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0l), _mm_slli_epi16(TL2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1l, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0l, 2)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0l), _mm_slli_epi16(TR2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1l, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0l, 2)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3)); TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3)); FS = _mm_cmpeq_epi16(FS, c_4); if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) { /* fs == 4 */ TL2l = TL2; TR2l = TR2; /* cal L0/R0 */ T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0l, TL2), TR0l), 3); T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0l, TL2)); T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0l, TR2), TL0l), 3); T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0l, TR2)); T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); TL0 = _mm_blendv_epi8(TL0, V0, FS); TR0 = _mm_blendv_epi8(TR0, V1, FS); /* cal L1/R1 */ T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0l), 1); T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0l, 3), TL0l)); T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0l, c_8)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0l), 1); T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0l, 3), TR0l)); T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0l, c_8)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); TL1 = _mm_blendv_epi8(TL1, V2, FS); TR1 = _mm_blendv_epi8(TR1, V3, FS); /* cal L2/R2 */ T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2); T2 = _mm_add_epi16(_mm_slli_epi16(TL0l, 2), TR0l); V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2); T2 = _mm_add_epi16(_mm_slli_epi16(TR0l, 2), TL0l); V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); TL2 = _mm_blendv_epi8(TL2, V4, FS); TR2 = _mm_blendv_epi8(TR2, V5, FS); } /* store result */ T0 = _mm_packus_epi16(TL3, TR0); T1 = _mm_packus_epi16(TL2, TR1); T2 = _mm_packus_epi16(TL1, TR2); T3 = _mm_packus_epi16(TL0, TR3); T4 = _mm_unpacklo_epi8(T0, T1); T5 = _mm_unpacklo_epi8(T2, T3); T6 = _mm_unpackhi_epi8(T0, T1); T7 = _mm_unpackhi_epi8(T2, T3); V0 = _mm_unpacklo_epi16(T4, T5); V1 = _mm_unpacklo_epi16(T6, T7); V2 = _mm_unpackhi_epi16(T4, T5); V3 = _mm_unpackhi_epi16(T6, 
T7); T0 = _mm_unpacklo_epi32(V0, V1); T1 = _mm_unpackhi_epi32(V0, V1); T2 = _mm_unpacklo_epi32(V2, V3); T3 = _mm_unpackhi_epi32(V2, V3); pTmp = SrcPtr - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T0, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T1); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T1, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T2); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T2, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T3); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T3, 8)); } void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? -1 : 0; __m128i UVL0, UVL1, UVR0, UVR1; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i P0, P1, P2, P3, P4, P5, P6, P7; __m128i V0, V1, V2, V3; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS4, FS56; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); pTmp = SrcPtrU - 4; T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); pTmp = SrcPtrV - 4; T4 = _mm_loadl_epi64((__m128i*)(pTmp)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); P0 = _mm_unpacklo_epi8(T0, T1); P1 = _mm_unpacklo_epi8(T2, T3); P2 = _mm_unpacklo_epi8(T4, T5); P3 = _mm_unpacklo_epi8(T6, T7); P4 = _mm_unpacklo_epi16(P0, P1); P5 = _mm_unpacklo_epi16(P2, P3); P6 = _mm_unpackhi_epi16(P0, P1); P7 = _mm_unpackhi_epi16(P2, P3); T0 = _mm_unpacklo_epi32(P4, P5); T1 = _mm_unpackhi_epi32(P4, P5); T2 = _mm_unpacklo_epi32(P6, P7); T3 = _mm_unpackhi_epi32(P6, P7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_sub_epi16(FLT, c_3); T1 = _mm_sub_epi16(FLT, c_4); T2 = _mm_subabs_epu16(TL1, TR1); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = 
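/* FS holds the per-pixel filter strength for this chroma edge (0 = leave the
 * pixels untouched, 1..3 = progressively stronger smoothing).  It is derived
 * from the FLT_L/FLT_R activity counts and is finally masked by M0, which
 * combines the |L0 - R0| threshold test against Alpha with the per-side
 * flt_flag[] switches. */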
_mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 UVL0 = TL0; UVL1 = TL1; UVR0 = TR0; UVR1 = TR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(UVL0, UVR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVR0, 1), T2), 2); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 1), _mm_add_epi16(UVL1, UVR0)); T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 1), _mm_add_epi16(UVR1, UVL0)); T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 2), _mm_add_epi16(TL2, UVR1)); T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 1), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(T0, 4); T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 2), _mm_add_epi16(TR2, UVL1)); T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 1), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(T0, 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3)); T0 = _mm_add_epi16(_mm_add_epi16(TL2, UVR0), _mm_slli_epi16(TL2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL0, 2)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); T0 = _mm_add_epi16(_mm_add_epi16(TR2, UVL0), _mm_slli_epi16(TR2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR0, 2)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3)); TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3)); /* store result */ T0 = _mm_packus_epi16(TL3, TR0); T1 = _mm_packus_epi16(TL2, TR1); T2 = _mm_packus_epi16(TL1, TR2); T3 = _mm_packus_epi16(TL0, TR3); P0 = _mm_unpacklo_epi8(T0, T1); P1 = _mm_unpacklo_epi8(T2, T3); P2 = _mm_unpackhi_epi8(T0, T1); P3 = _mm_unpackhi_epi8(T2, T3); P4 = _mm_unpacklo_epi16(P0, P1); P5 = _mm_unpacklo_epi16(P2, P3); P6 = _mm_unpackhi_epi16(P0, P1); P7 = _mm_unpackhi_epi16(P2, P3); T0 = _mm_unpacklo_epi32(P4, P5); T1 = _mm_unpackhi_epi32(P4, P5); T2 = _mm_unpacklo_epi32(P6, P7); T3 = _mm_unpackhi_epi32(P6, P7); pTmp = SrcPtrU - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T0, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T1); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T1, 8)); pTmp = SrcPtrV - 4; _mm_storel_epi64((__m128i*)(pTmp), T2); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T2, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T3); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T3, 8)); } void deblock_edge_hor_sse128(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag) { int inc = stride; int inc2 = inc << 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
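/* Horizontal luma edge: the eight pixels of each of the six rows around the
 * edge (L2, L1, L0 above, R0, R1, R2 below) are widened to 16 bits and
 * filtered in parallel.  As a hedged scalar sketch of what the vector code
 * below effectively computes for the left side (the right side is symmetric):
 *     fs == 1:  L0' = (3*L0 +         R0 + 2) >> 2
 *     fs == 2:  L0' = (3*L1 + 10*L0 + 3*R0 + 8) >> 4
 * stronger strengths additionally rewrite L1/R1 (and L2/R2 for fs == 4). */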
-1 : 0; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i TL0w, TL1w, TL2w, TR0w, TR1w, TR2w; //for write __m128i V0, V1, V2, V3, V4, V5; __m128i T0, T1, T2; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS3, FS4, FS56; __m128i ALPHA = _mm_set1_epi16((int16_t)Alpha); __m128i BETA = _mm_set1_epi16((int16_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); __m128i c_16 = _mm_set1_epi16(16); TL2 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc3)); TL1 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc2)); TL0 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc)); TR0 = _mm_loadl_epi64((__m128i*)(SrcPtr + 0)); TR1 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc)); TR2 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc2)); TL2 = _mm_unpacklo_epi8(TL2, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL0 = _mm_unpacklo_epi8(TL0, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_subs_epi16(FLT, c_2); T1 = _mm_subs_epi16(FLT, c_3); T2 = _mm_subabs_epu16(TL1, TR1); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2)); FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 TR0w = TR0; TR1w = TR1; TL0w = TL0; TL1w = TL1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); TL0w = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); TR0w = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_2)); TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(T0, 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 
2), _mm_add_epi16(TR2, TL1)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(T0, 4); TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_3)); TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_3)); T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); TL1w = _mm_blendv_epi8(TL1w, V2, _mm_cmpeq_epi16(FS, c_3)); TR1w = _mm_blendv_epi8(TR1w, V3, _mm_cmpeq_epi16(FS, c_3)); FS = _mm_cmpeq_epi16(FS, c_4); if (M128_U64(FS, 0) || M128_U64(FS, 1)) { /* fs == 4 */ /* cal L0/R0 */ T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0, TL2), TR0), 3); T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0, TL2)); T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0, TR2), TL0), 3); T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0, TR2)); T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5); TL0w = _mm_blendv_epi8(TL0w, V0, FS); TR0w = _mm_blendv_epi8(TR0w, V1, FS); /* cal L1/R1 */ T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0), 1); T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0, 3), TL0)); T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0, c_8)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0), 1); T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0, 3), TR0)); T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0, c_8)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4); TL1w = _mm_blendv_epi8(TL1w, V2, FS); TR1w = _mm_blendv_epi8(TR1w, V3, FS); /* cal L2/R2 */ T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2); T2 = _mm_add_epi16(_mm_slli_epi16(TL0, 2), TR0); V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2); T2 = _mm_add_epi16(_mm_slli_epi16(TR0, 2), TL0); V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3); TL2w = _mm_blendv_epi8(TL2, V4, FS); TR2w = _mm_blendv_epi8(TR2, V5, FS); /* store result */ _mm_storel_epi64((__m128i*)(SrcPtr - inc ), _mm_packus_epi16(TL0w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr ), _mm_packus_epi16(TR0w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm_packus_epi16(TL1w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr + inc ), _mm_packus_epi16(TR1w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr - inc3), _mm_packus_epi16(TL2w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr + inc2), _mm_packus_epi16(TR2w, c_0)); } else { /* store result */ _mm_storel_epi64((__m128i*)(SrcPtr - inc ), _mm_packus_epi16(TL0w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr ), _mm_packus_epi16(TR0w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm_packus_epi16(TL1w, c_0)); _mm_storel_epi64((__m128i*)(SrcPtr + inc ), _mm_packus_epi16(TR1w, c_0)); } } void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag) { int inc = stride; int inc2 = inc << 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
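/* Chroma variant of the horizontal edge filter: four U pixels and four V
 * pixels are packed side by side into one 8x16-bit register (U in the low
 * half, V in the high half), so both planes are filtered together.  Chroma
 * uses at most strength 3; there is no fs == 4 path in this function. */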
-1 : 0; __m128i UL0, UL1, UR0, UR1; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i T0, T1, T2; __m128i V0, V1, V2, V3; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS4, FS56; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); TL0 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc))[0], ((int32_t*)(SrcPtrU - inc))[0]); TL1 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc2))[0], ((int32_t*)(SrcPtrU - inc2))[0]); TL2 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc3))[0], ((int32_t*)(SrcPtrU - inc3))[0]); TR0 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV))[0], ((int32_t*)(SrcPtrU))[0]); TR1 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc))[0], ((int32_t*)(SrcPtrU + inc))[0]); TR2 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc2))[0], ((int32_t*)(SrcPtrU + inc2))[0]); TL0 = _mm_unpacklo_epi8(TL0, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL2 = _mm_unpacklo_epi8(TL2, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_subs_epi16(FLT, c_3); T1 = _mm_subs_epi16(FLT, c_4); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 UR0 = TR0; //UR0 TR0 to store UR1 = TR1; UL0 = TL0; UL1 = TL1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); UL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); UR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_2)); UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(T0, 4); T0 = 
_mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(T0, 4); UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_3)); UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_3)); T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); UL1 = _mm_blendv_epi8(UL1, V2, _mm_cmpeq_epi16(FS, c_3)); UR1 = _mm_blendv_epi8(UR1, V3, _mm_cmpeq_epi16(FS, c_3)); /* store result */ UL0 = _mm_packus_epi16(UL0, c_0); UL1 = _mm_packus_epi16(UL1, c_0); UR0 = _mm_packus_epi16(UR0, c_0); UR1 = _mm_packus_epi16(UR1, c_0); ((int32_t*)(SrcPtrU - inc ))[0] = M128_I32(UL0, 0); ((int32_t*)(SrcPtrU ))[0] = M128_I32(UR0, 0); ((int32_t*)(SrcPtrU - inc2))[0] = M128_I32(UL1, 0); ((int32_t*)(SrcPtrU + inc ))[0] = M128_I32(UR1, 0); ((int32_t*)(SrcPtrV - inc ))[0] = M128_I32(UL0, 1); ((int32_t*)(SrcPtrV ))[0] = M128_I32(UR0, 1); ((int32_t*)(SrcPtrV - inc2))[0] = M128_I32(UL1, 1); ((int32_t*)(SrcPtrV + inc ))[0] = M128_I32(UR1, 1); } xavs2-1.3/source/common/vec/intrinsic_deblock_avx2.c000066400000000000000000001002511340660520300225540ustar00rootroot00000000000000/* * intrinsic_deblock_avx2.c * * Description of this file: * AVX2 assembly functions of Deblock module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include <mmintrin.h> #include <emmintrin.h> #include <tmmintrin.h> #include <smmintrin.h> #include <immintrin.h> #include "../basic_types.h" #include "intrinsic.h" void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp = SrcPtr - 4; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
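/* AVX2 layout used by the functions below: each 256-bit TLR0..TLR2 register
 * keeps the left-side rows in its low 128-bit lane and the right-side rows in
 * the high lane, while TRL0..TRL1 hold the two sides in swapped order, so one
 * instruction stream updates both sides of the edge at once.  FS_256 simply
 * duplicates the 128-bit strength mask FS into both lanes. */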
-1 : 0; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i M0, M1; __m128i FLT, FS; __m128i FS3, FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1, TRL2; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_LR; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i c_16_256 = _mm256_set1_epi16(16); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); T4 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 4)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 5)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 6)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 7)); //--------------- transpose ------------------------------- T0 = _mm_unpacklo_epi8(T0, T1); T1 = _mm_unpacklo_epi8(T2, T3); T2 = _mm_unpacklo_epi8(T4, T5); T3 = _mm_unpacklo_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); /* TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(T4), T6, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(T5), T7, 1); TLR0w = _mm256_unpacklo_epi32(TLR0, TLR1); //T0 T2 TLR1w = _mm256_unpackhi_epi32(TLR0, TLR1); //T1 T3 TLR3 = _mm256_unpacklo_epi8(TLR0w, c_0_256); //TL3 TR0 TLR2 = _mm256_unpackhi_epi8(TLR0w, c_0_256); //TL2 TR1 TLR1 = _mm256_unpacklo_epi8(TLR1w, c_0_256); //TL1 TR2 TLR0 = _mm256_unpackhi_epi8(TLR1w, c_0_256); //TL0 TR3 TR0 = _mm256_extracti128_si256(TLR3, 0x01); TR1 = _mm256_extracti128_si256(TLR2, 0x01); TR2 = _mm256_extracti128_si256(TLR1, 0x01); TR3 = _mm256_extracti128_si256(TLR0, 0x01); TLR0 = _mm256_inserti128_si256(TLR0, TR0, 1); TLR1 = _mm256_inserti128_si256(TLR1, TR1, 1); TLR2 = _mm256_inserti128_si256(TLR2, TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), _mm256_castsi256_si128(TLR0), 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), _mm256_castsi256_si128(TLR1), 1); */ T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(TL0, TR0)); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_LR = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, 
T0_256), c_2_256); T1_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T2_256 = _mm256_cmpgt_epi16(BETA_256, T1_256); FLT_LR = _mm256_add_epi16(_mm256_and_si256(T2_256, c_1_256), FLT_LR); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_LR), _mm256_extracti128_si256(FLT_LR, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_2_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T2 = _mm_abs_epi16(_mm_subs_epi16(TL1, TR1)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(_mm256_castsi256_si128(c_1_256), _mm256_castsi256_si128(c_2_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_LR), _mm256_castsi256_si128(c_2_256))); FS3 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_3_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256)); /* fs == 2 */ T2_256 = _mm256_slli_epi16(T2_256, 1); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256)); /* fs == 3 */ T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8 T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(T0_256, 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); FS = _mm_cmpeq_epi16(FS, _mm256_castsi256_si128(c_4_256)); if (_mm_extract_epi64(FS, 0) || _mm_extract_epi64(FS, 1)) { /* fs == 4 */ TRL2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR2), TL2, 1); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); /* cal L0/R0 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(_mm256_add_epi16(TLR0, TLR2), TRL0), 3); T0_256 = _mm256_add_epi16(_mm256_add_epi16(T0_256, c_16_256), _mm256_add_epi16(TLR0, TLR2)); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TRL2, 1), _mm256_slli_epi16(TRL2, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 5); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, FS_256); /* cal L1/R1 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(TLR2, TRL0), 1); T0_256 = _mm256_add_epi16(T0_256, _mm256_sub_epi16(_mm256_slli_epi16(TLR0, 3), 
TLR0)); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 2), _mm256_add_epi16(TRL0, c_8_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, FS_256); /* cal L2/R2 */ T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 1), TLR2); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 2), TRL0); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, _mm256_add_epi16(T2_256, c_4_256)), 3); TLR2 = _mm256_blendv_epi8(TLR2, T1_256, FS_256); } /* stroe result */ T4 = _mm_packus_epi16(TL3, _mm256_extracti128_si256(TLR0w, 0x01)); T5 = _mm_packus_epi16(_mm256_castsi256_si128(TLR2), _mm256_extracti128_si256(TLR1w, 0x01)); T6 = _mm_packus_epi16(_mm256_castsi256_si128(TLR1w), _mm256_extracti128_si256(TLR2, 0x01)); T7 = _mm_packus_epi16(_mm256_castsi256_si128(TLR0w), TR3); T0 = _mm_unpacklo_epi8(T4, T5); T1 = _mm_unpacklo_epi8(T6, T7); T2 = _mm_unpackhi_epi8(T4, T5); T3 = _mm_unpackhi_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); pTmp = SrcPtr - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T0, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T1); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T1, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T2); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T2, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T3); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T3, 8)); } void deblock_edge_ver_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
-1 : 0; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i M0, M1; __m128i FLT, FS; __m128i FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_X; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i c_0 = _mm_set1_epi16(0); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); pTmp = SrcPtrU - 4; T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); pTmp = SrcPtrV - 4; T4 = _mm_loadl_epi64((__m128i*)(pTmp)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); T0 = _mm_unpacklo_epi8(T0, T1); T1 = _mm_unpacklo_epi8(T2, T3); T2 = _mm_unpacklo_epi8(T4, T5); T3 = _mm_unpacklo_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0))); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256); T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256); FLT_X = _mm256_add_epi16(T1_256, FLT_X); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_4_256)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256))); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = 
_mm_add_epi16(_mm_add_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0)), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256)); /* fs == 2 */ T2_256 = _mm256_slli_epi16(T2_256, 1); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256)); /* fs == 3 */ T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8 T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(T0_256, 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); /* stroe result */ T4 = _mm_packus_epi16(TL3, _mm256_extracti128_si256(TLR0w, 0x01)); T5 = _mm_packus_epi16(TL2, _mm256_extracti128_si256(TLR1w, 0x01)); T6 = _mm_packus_epi16(_mm256_castsi256_si128(TLR1w), TR2); T7 = _mm_packus_epi16(_mm256_castsi256_si128(TLR0w), TR3); T0 = _mm_unpacklo_epi8(T4, T5); T1 = _mm_unpacklo_epi8(T6, T7); T2 = _mm_unpackhi_epi8(T4, T5); T3 = _mm_unpackhi_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); pTmp = SrcPtrU - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T0, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T1); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T1, 8)); pTmp = SrcPtrV - 4; _mm_storel_epi64((__m128i*)(pTmp), T2); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T2, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T3); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T3, 8)); } void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag) { int inc = stride; int inc2 = inc << 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? 
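/* For a horizontal edge no transpose is needed: each of the six rows around
 * the edge is fetched with a single 64-bit load, widened to 16 bits and then
 * combined into the two-lane TLR registers described above. */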
-1 : 0; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i T0, T1, T2; __m128i M0, M1; __m128i FLT, FS; __m128i FS3, FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1, TRL2; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_X; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((short)Alpha); __m128i BETA = _mm_set1_epi16((short)Beta); __m128i c_0 = _mm_set1_epi16(0); __m256i c_0_256 = _mm256_setzero_si256(); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i c_16_256 = _mm256_set1_epi16(16); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); TL2 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc3)); TL1 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc2)); TL0 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc)); TR0 = _mm_loadl_epi64((__m128i*)(SrcPtr + 0)); TR1 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc)); TR2 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc2)); TL2 = _mm_unpacklo_epi8(TL2, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL0 = _mm_unpacklo_epi8(TL0, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(TL0, TR0)); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256); T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256); FLT_X = _mm256_add_epi16(T1_256, FLT_X); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_2_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T2 = _mm_abs_epi16(_mm_subs_epi16(TL1, TR1)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(_mm256_castsi256_si128(c_1_256), _mm256_castsi256_si128(c_2_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256))); FS3 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_3_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, 
T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256)); /* fs == 2 */ T2_256 = _mm256_slli_epi16(T2_256, 1); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256)); /* fs == 3 */ T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8 T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(T0_256, 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); FS = _mm_cmpeq_epi16(FS, _mm256_castsi256_si128(c_4_256)); if (_mm_extract_epi64(FS, 0) || _mm_extract_epi64(FS, 1)) { /* fs == 4 */ TRL2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR2), TL2, 1); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); /* cal L0/R0 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(_mm256_add_epi16(TLR0, TLR2), TRL0), 3); T0_256 = _mm256_add_epi16(_mm256_add_epi16(T0_256, c_16_256), _mm256_add_epi16(TLR0, TLR2)); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TRL2, 1), _mm256_slli_epi16(TRL2, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 5); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, FS_256); /* cal L1/R1 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(TLR2, TRL0), 1); T0_256 = _mm256_add_epi16(T0_256, _mm256_sub_epi16(_mm256_slli_epi16(TLR0, 3), TLR0)); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 2), _mm256_add_epi16(TRL0, c_8_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, FS_256); /* cal L2/R2 */ T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 1), TLR2); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 2), TRL0); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, _mm256_add_epi16(T2_256, c_4_256)), 3); TLR2 = _mm256_blendv_epi8(TLR2, T1_256, FS_256); TLR0w = _mm256_packus_epi16(TLR0w, c_0_256); TLR1w = _mm256_packus_epi16(TLR1w, c_0_256); TLR2 = _mm256_packus_epi16(TLR2, c_0_256); /* store result */ _mm_storel_epi64((__m128i*)(SrcPtr - inc), _mm256_castsi256_si128(TLR0w)); _mm_storel_epi64((__m128i*)(SrcPtr - 0), _mm256_extracti128_si256(TLR0w, 0x01)); _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm256_castsi256_si128(TLR1w)); _mm_storel_epi64((__m128i*)(SrcPtr + inc), _mm256_extracti128_si256(TLR1w, 0x01)); _mm_storel_epi64((__m128i*)(SrcPtr - inc3), _mm256_castsi256_si128(TLR2)); _mm_storel_epi64((__m128i*)(SrcPtr + inc2), _mm256_extracti128_si256(TLR2, 0x01)); } else { /* store result */ TLR0w = _mm256_packus_epi16(TLR0w, c_0_256); TLR1w = _mm256_packus_epi16(TLR1w, c_0_256); _mm_storel_epi64((__m128i*)(SrcPtr - inc), _mm256_castsi256_si128(TLR0w)); _mm_storel_epi64((__m128i*)(SrcPtr - 0), _mm256_extracti128_si256(TLR0w, 0x01)); _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm256_castsi256_si128(TLR1w)); _mm_storel_epi64((__m128i*)(SrcPtr + inc), _mm256_extracti128_si256(TLR1w, 0x01)); } } // NOTE: i32s_t should be replaced with int32_t (signed int).
void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag) { int inc = stride; int inc2 = inc << 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? -1 : 0; __m128i T0, T1, T2; __m128i M0, M1; __m128i FLT, FS; __m128i FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_X; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((short)Alpha); __m128i c_0 = _mm_set1_epi16(0); __m256i c_0_256 = _mm256_setzero_si256(); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); TLR0 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV))[0], ((int32_t*)(SrcPtrU))[0], 0, 0, ((int32_t*)(SrcPtrV - inc))[0], ((int32_t*)(SrcPtrU - inc))[0]); TLR1 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc))[0], ((int32_t*)(SrcPtrU + inc))[0], 0, 0, ((int32_t*)(SrcPtrV - inc2))[0], ((int32_t*)(SrcPtrU - inc2))[0]); TLR2 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc2))[0], ((int32_t*)(SrcPtrU + inc2))[0], 0, 0, ((int32_t*)(SrcPtrV - inc3))[0], ((int32_t*)(SrcPtrU - inc3))[0]); TLR0 = _mm256_unpacklo_epi8(TLR0, c_0_256); TLR1 = _mm256_unpacklo_epi8(TLR1, c_0_256); TLR2 = _mm256_unpacklo_epi8(TLR2, c_0_256); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(TLR0, 0x01)), _mm256_castsi256_si128(TLR0), 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(TLR1, 0x01)), _mm256_castsi256_si128(TLR1), 1); T0 = _mm_abs_epi16(_mm_subs_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0))); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256); T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256); FLT_X = _mm256_add_epi16(T1_256, FLT_X); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_4_256)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256))); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0)), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, T1_256, 
_mm256_cmpeq_epi16(FS_256, c_1_256)); /* fs == 2 */ T2_256 = _mm256_slli_epi16(T2_256, 1); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256)); /* fs == 3 */ T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8 T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(T0_256, 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); /* stroe result */ TLR0w = _mm256_packus_epi16(TLR0w, c_0_256); TLR1w = _mm256_packus_epi16(TLR1w, c_0_256); ((int32_t*)(SrcPtrU - inc))[0] = _mm256_extract_epi32(TLR0w, 0); ((int32_t*)(SrcPtrU))[0] = _mm256_extract_epi32(TLR0w, 4); ((int32_t*)(SrcPtrU - inc2))[0] = _mm256_extract_epi32(TLR1w, 0); ((int32_t*)(SrcPtrU + inc))[0] = _mm256_extract_epi32(TLR1w, 4); ((int32_t*)(SrcPtrV - inc))[0] = _mm256_extract_epi32(TLR0w, 1); ((int32_t*)(SrcPtrV))[0] = _mm256_extract_epi32(TLR0w, 5); ((int32_t*)(SrcPtrV - inc2))[0] = _mm256_extract_epi32(TLR1w, 1); ((int32_t*)(SrcPtrV + inc))[0] = _mm256_extract_epi32(TLR1w, 5); } xavs2-1.3/source/common/vec/intrinsic_idct.c000066400000000000000000011661531340660520300211520ustar00rootroot00000000000000/* * intrinsic_idct.c * * Description of this file: * SSE assembly functions of IDCT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../basic_types.h" #include "../avs2_defs.h" #include "intrinsic.h" #include #include #include #include ALIGN32(static const coeff_t tab_idct_8x8[12][8]) = { { 44, 38, 44, 38, 44, 38, 44, 38 }, { 25, 9, 25, 9, 25, 9, 25, 9 }, { 38, -9, 38, -9, 38, -9, 38, -9 }, { -44, -25, -44, -25, -44, -25, -44, -25 }, { 25, -44, 25, -44, 25, -44, 25, -44 }, { 9, 38, 9, 38, 9, 38, 9, 38 }, { 9, -25, 9, -25, 9, -25, 9, -25 }, { 38, -44, 38, -44, 38, -44, 38, -44 }, { 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17 }, { 17, -42, 17, -42, 17, -42, 17, -42 } }; extern ALIGN16(const int16_t g_2T [SEC_TR_SIZE * SEC_TR_SIZE]); extern ALIGN16(const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]); /* --------------------------------------------------------------------------- */ void idct_c_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; // const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1 __m128i S0, S1; __m128i T0, T1; __m128i E0, E1, O0, O1; S0 = _mm_loadu_si128((__m128i*)(src )); S1 = _mm_loadu_si128((__m128i*)(src+ 8)); T0 = _mm_unpacklo_epi16(S0, S1); E0 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd); E1 = _mm_add_epi32(_mm_madd_epi16(T0, c16_n32_p32), c32_rnd); T1 = _mm_unpackhi_epi16(S0, S1); O0 = _mm_madd_epi16(T1, c16_p17_p42); O1 = _mm_madd_epi16(T1, c16_n42_p17); S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0, O0), shift1), _mm_srai_epi32(_mm_sub_epi32(E1, O1), shift1)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1, O1), shift1), _mm_srai_epi32(_mm_sub_epi32(E0, O0), shift1)); /* inverse */ T0 = _mm_unpacklo_epi16(S0, S1); T1 = _mm_unpackhi_epi16(S0, S1); S0 = _mm_unpacklo_epi32(T0, T1); S1 = _mm_unpackhi_epi32(T0, T1); /* second pass ------------------------------------------------- */ c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 T0 = _mm_unpacklo_epi16(S0, S1); E0 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd); E1 = _mm_add_epi32(_mm_madd_epi16(T0, c16_n32_p32), c32_rnd); T1 = _mm_unpackhi_epi16(S0, S1); O0 = _mm_madd_epi16(T1, c16_p17_p42); O1 = _mm_madd_epi16(T1, c16_n42_p17); S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0, O0), shift2), _mm_srai_epi32(_mm_sub_epi32(E1, O1), shift2)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1, O1), shift2), _mm_srai_epi32(_mm_sub_epi32(E0, O0), shift2)); T0 = _mm_unpacklo_epi16(S0, S1); T1 = _mm_unpackhi_epi16(S0, S1); S0 = _mm_unpacklo_epi32(T0, T1); S1 = _mm_unpackhi_epi32(T0, T1); // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); S0 = _mm_max_epi16(_mm_min_epi16(S0, max_val), min_val); S1 = _mm_max_epi16(_mm_min_epi16(S1, max_val), min_val); } // store if (i_dst == 4) { _mm_store_si128((__m128i*)(dst + 0), S0); _mm_store_si128((__m128i*)(dst + 8), S1); } else { _mm_storel_epi64((__m128i*)(dst + 0 * i_dst), S0); _mm_storeh_pi((__m64 *)(dst + 1 * i_dst), _mm_castsi128_ps(S0)); _mm_storel_epi64((__m128i*)(dst + 2 * i_dst), S1); _mm_storeh_pi((__m64 *)(dst + 3 * i_dst), _mm_castsi128_ps(S1)); } } /* --------------------------------------------------------------------------- 
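   idct_c_4x16_sse128: inverse transform of a 4x16 block of coefficients.
   Like the 4x4 version above it runs in two passes: the first pass uses
   shift1 = 5 and the second shift2 = 20 - g_bit_depth, each with the usual
   (x + (1 << (shift - 1))) >> shift rounding, and the final samples are
   clipped to the signed (g_bit_depth + 1)-bit range before being stored.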
*/ void idct_c_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; // const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1 // DCT1 __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i res00, res01, res02, res03, res04, res05, res06, res07; in00 = _mm_loadu_si128((const __m128i*)&src[ 0 * 4]); // [07 06 05 04 03 02 01 00] in01 = _mm_loadu_si128((const __m128i*)&src[ 2 * 4]); // [27 26 25 24 23 22 21 20] in02 = _mm_loadu_si128((const __m128i*)&src[ 4 * 4]); // [47 46 45 44 43 42 41 40] in03 = _mm_loadu_si128((const __m128i*)&src[ 6 * 4]); // [67 66 65 64 63 62 61 60] in04 = _mm_loadu_si128((const __m128i*)&src[ 8 * 4]); in05 = _mm_loadu_si128((const __m128i*)&src[10 * 4]); in06 = _mm_loadu_si128((const __m128i*)&src[12 * 4]); in07 = _mm_loadu_si128((const __m128i*)&src[14 * 4]); { const __m128i T_00_00A = _mm_unpackhi_epi16(in00, in01); // 
[33 13 32 12 31 11 30 10] const __m128i T_00_01A = _mm_unpackhi_epi16(in02, in03); // [ ] const __m128i T_00_02A = _mm_unpackhi_epi16(in04, in05); // [ ] const __m128i T_00_03A = _mm_unpackhi_epi16(in06, in07); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in01, in03); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in05, in07); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in02, in06); // [ ]row const __m128i T_00_07A = _mm_unpacklo_epi16(in00, in04); // [83 03 82 02 81 01 81 00] row08 row00 __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EEO0A, EEO1A; __m128i EEE0A, EEE1A; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ row = _mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)), \ _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315))); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) #undef COMPUTE_ROW EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), shift1); 
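/* first pass: rows 0..7 are (E + O + rnd) >> shift1, rows 8..15 (x8..xF) are (E - O + rnd) >> shift1 */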
// E0 + O0 + rnd [30 20 10 00] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), shift1); // E1 + O1 + rnd [31 21 11 01] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), shift1); // E2 + O2 + rnd [32 22 12 02] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), shift1); // E3 + O3 + rnd [33 23 13 03] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), shift1); // E4 [33 24 14 04] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), shift1); // E5 [35 25 15 05] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), shift1); // E6 [36 26 16 06] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), shift1); // E7 [37 27 17 07] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), shift1); // E7 [30 20 10 00] x8 const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), shift1); // E6 [31 21 11 01] x9 const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), shift1); // E5 [32 22 12 02] xA const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), shift1); // E4 [33 23 13 03] xB const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), shift1); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), shift1); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), shift1); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), shift1); // E0 - O0 + rnd [37 27 17 07] xF res00 = _mm_packs_epi32(T30A, T38A); res01 = _mm_packs_epi32(T31A, T39A); res02 = _mm_packs_epi32(T32A, T3AA); res03 = _mm_packs_epi32(T33A, T3BA); res04 = _mm_packs_epi32(T34A, T3CA); res05 = _mm_packs_epi32(T35A, T3DA); res06 = _mm_packs_epi32(T36A, T3EA); res07 = _mm_packs_epi32(T37A, T3FA); } } // transpose matrix { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i E01, E02, E03, E04, E11, E12, E13, E14; __m128i O01, O02, O03, O04, O11, O12, O13, O14; __m128i m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3; tr0_0 = _mm_unpacklo_epi16(res00, res01); tr0_1 = _mm_unpackhi_epi16(res00, res01); tr0_2 = _mm_unpacklo_epi16(res02, res03); tr0_3 = _mm_unpackhi_epi16(res02, res03); tr0_4 = _mm_unpacklo_epi16(res04, res05); tr0_5 = _mm_unpackhi_epi16(res04, res05); tr0_6 = _mm_unpacklo_epi16(res06, res07); tr0_7 = _mm_unpackhi_epi16(res06, res07); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_2); tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_2); tr1_2 = _mm_unpacklo_epi32(tr0_1, tr0_3); tr1_3 = _mm_unpackhi_epi32(tr0_1, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_6); tr1_5 = _mm_unpackhi_epi32(tr0_4, tr0_6); tr1_6 = _mm_unpacklo_epi32(tr0_5, tr0_7); tr1_7 = _mm_unpackhi_epi32(tr0_5, tr0_7); res00 = _mm_unpacklo_epi64(tr1_0, tr1_4); res02 = _mm_unpackhi_epi64(tr1_0, tr1_4); res04 = _mm_unpacklo_epi64(tr1_1, tr1_5); res06 = _mm_unpackhi_epi64(tr1_1, tr1_5); res01 = _mm_unpacklo_epi64(tr1_2, tr1_6); res03 = _mm_unpackhi_epi64(tr1_2, tr1_6); res05 = _mm_unpacklo_epi64(tr1_3, tr1_7); res07 = _mm_unpackhi_epi64(tr1_3, tr1_7); c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 m128Tmp0 = _mm_unpacklo_epi16(res00, res04); E01 = _mm_add_epi32(_mm_madd_epi16(m128Tmp0, c16_p32_p32), c32_rnd); E11 = _mm_add_epi32(_mm_madd_epi16(m128Tmp0, c16_n32_p32), c32_rnd); m128Tmp1 = _mm_unpackhi_epi16(res00, res04); E02 = _mm_add_epi32(_mm_madd_epi16(m128Tmp1, c16_p32_p32), c32_rnd); E12 = _mm_add_epi32(_mm_madd_epi16(m128Tmp1, c16_n32_p32), c32_rnd); m128Tmp0 = _mm_unpacklo_epi16(res01, 
res05); E03 = _mm_add_epi32(_mm_madd_epi16(m128Tmp0, c16_p32_p32), c32_rnd); E13 = _mm_add_epi32(_mm_madd_epi16(m128Tmp0, c16_n32_p32), c32_rnd); m128Tmp1 = _mm_unpackhi_epi16(res01, res05); E04 = _mm_add_epi32(_mm_madd_epi16(m128Tmp1, c16_p32_p32), c32_rnd); E14 = _mm_add_epi32(_mm_madd_epi16(m128Tmp1, c16_n32_p32), c32_rnd); m128Tmp0 = _mm_unpacklo_epi16(res02, res06); O01 = _mm_madd_epi16(m128Tmp0, c16_p17_p42); O11 = _mm_madd_epi16(m128Tmp0, c16_n42_p17); m128Tmp1 = _mm_unpackhi_epi16(res02, res06); O02 = _mm_madd_epi16(m128Tmp1, c16_p17_p42); O12 = _mm_madd_epi16(m128Tmp1, c16_n42_p17); m128Tmp0 = _mm_unpacklo_epi16(res03, res07); O03 = _mm_madd_epi16(m128Tmp0, c16_p17_p42); O13 = _mm_madd_epi16(m128Tmp0, c16_n42_p17); m128Tmp1 = _mm_unpackhi_epi16(res03, res07); O04 = _mm_madd_epi16(m128Tmp1, c16_p17_p42); O14 = _mm_madd_epi16(m128Tmp1, c16_n42_p17); res00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E01, O01), shift2), _mm_srai_epi32(_mm_add_epi32(E02, O02), shift2)); res01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E03, O03), shift2), _mm_srai_epi32(_mm_add_epi32(E04, O04), shift2)); res06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E01, O01), shift2), _mm_srai_epi32(_mm_sub_epi32(E02, O02), shift2)); res07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E03, O03), shift2), _mm_srai_epi32(_mm_sub_epi32(E04, O04), shift2)); res02 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E11, O11), shift2), _mm_srai_epi32(_mm_add_epi32(E12, O12), shift2)); res03 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E13, O13), shift2), _mm_srai_epi32(_mm_add_epi32(E14, O14), shift2)); res04 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E11, O11), shift2), _mm_srai_epi32(_mm_sub_epi32(E12, O12), shift2)); res05 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E13, O13), shift2), _mm_srai_epi32(_mm_sub_epi32(E14, O14), shift2)); m128Tmp0 = _mm_unpacklo_epi16(res00, res02); m128Tmp1 = _mm_unpackhi_epi16(res00, res02); m128Tmp2 = _mm_unpacklo_epi16(res04, res06); m128Tmp3 = _mm_unpackhi_epi16(res04, res06); res00 = _mm_unpacklo_epi32(m128Tmp0, m128Tmp2); res02 = _mm_unpackhi_epi32(m128Tmp0, m128Tmp2); res04 = _mm_unpacklo_epi32(m128Tmp1, m128Tmp3); res06 = _mm_unpackhi_epi32(m128Tmp1, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(res01, res03); m128Tmp1 = _mm_unpackhi_epi16(res01, res03); m128Tmp2 = _mm_unpacklo_epi16(res05, res07); m128Tmp3 = _mm_unpackhi_epi16(res05, res07); res01 = _mm_unpacklo_epi32(m128Tmp0, m128Tmp2); res03 = _mm_unpackhi_epi32(m128Tmp0, m128Tmp2); res05 = _mm_unpacklo_epi32(m128Tmp1, m128Tmp3); res07 = _mm_unpackhi_epi32(m128Tmp1, m128Tmp3); } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); res00 = _mm_max_epi16(_mm_min_epi16(res00, max_val), min_val); res02 = _mm_max_epi16(_mm_min_epi16(res02, max_val), min_val); res04 = _mm_max_epi16(_mm_min_epi16(res04, max_val), min_val); res06 = _mm_max_epi16(_mm_min_epi16(res06, max_val), min_val); res01 = _mm_max_epi16(_mm_min_epi16(res01, max_val), min_val); res03 = _mm_max_epi16(_mm_min_epi16(res03, max_val), min_val); res05 = _mm_max_epi16(_mm_min_epi16(res05, max_val), min_val); res07 = _mm_max_epi16(_mm_min_epi16(res07, max_val), min_val); } // store if (i_dst == 4) { _mm_store_si128((__m128i*)(dst + 0 * 4), res00); _mm_store_si128((__m128i*)(dst + 2 * 4), res02); _mm_store_si128((__m128i*)(dst + 4 * 4), res04); _mm_store_si128((__m128i*)(dst + 6 * 4), res06); _mm_store_si128((__m128i*)(dst + 8 * 4), res01); _mm_store_si128((__m128i*)(dst 
+ 10 * 4), res03); _mm_store_si128((__m128i*)(dst + 12 * 4), res05); _mm_store_si128((__m128i*)(dst + 14 * 4), res07); } else { _mm_storel_epi64((__m128i*)(dst + 0 * i_dst), res00); _mm_storeh_pi ((__m64 *)(dst + 1 * i_dst), _mm_castsi128_ps(res00)); _mm_storel_epi64((__m128i*)(dst + 2 * i_dst), res02); _mm_storeh_pi ((__m64 *)(dst + 3 * i_dst), _mm_castsi128_ps(res02)); _mm_storel_epi64((__m128i*)(dst + 4 * i_dst), res04); _mm_storeh_pi ((__m64 *)(dst + 5 * i_dst), _mm_castsi128_ps(res04)); _mm_storel_epi64((__m128i*)(dst + 6 * i_dst), res06); _mm_storeh_pi ((__m64 *)(dst + 7 * i_dst), _mm_castsi128_ps(res06)); _mm_storel_epi64((__m128i*)(dst + 8 * i_dst), res01); _mm_storeh_pi ((__m64 *)(dst + 9 * i_dst), _mm_castsi128_ps(res01)); _mm_storel_epi64((__m128i*)(dst + 10 * i_dst), res03); _mm_storeh_pi ((__m64 *)(dst + 11 * i_dst), _mm_castsi128_ps(res03)); _mm_storel_epi64((__m128i*)(dst + 12 * i_dst), res05); _mm_storeh_pi ((__m64 *)(dst + 13 * i_dst), _mm_castsi128_ps(res05)); _mm_storel_epi64((__m128i*)(dst + 14 * i_dst), res07); _mm_storeh_pi ((__m64 *)(dst + 15 * i_dst), _mm_castsi128_ps(res07)); } } /* --------------------------------------------------------------------------- */ void idct_c_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; // const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = 
_mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1 // DCT1 __m128i in00[2], in01[2], in02[2], in03[2]; __m128i res00[2], res01[2], res02[2], res03[2]; int i, part; for (i = 0; i < 2; i++) { const int offset = (i << 3); in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00] in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10] in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20] in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30] } for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); const __m128i T_00_01A = _mm_unpacklo_epi16(in00[part], in02[part]); const __m128i T_00_01B = _mm_unpackhi_epi16(in00[part], in02[part]); __m128i E0A, E0B, E1A, E1B, O0A, O0B, O1A, O1B; E0A = _mm_add_epi32(_mm_madd_epi16(T_00_01A, c16_p32_p32), c32_rnd); E1A = _mm_add_epi32(_mm_madd_epi16(T_00_01A, c16_n32_p32), c32_rnd); E0B = _mm_add_epi32(_mm_madd_epi16(T_00_01B, c16_p32_p32), c32_rnd); E1B = _mm_add_epi32(_mm_madd_epi16(T_00_01B, c16_n32_p32), c32_rnd); O0A = _mm_madd_epi16(T_00_00A, c16_p17_p42); O1A = _mm_madd_epi16(T_00_00A, c16_n42_p17); O0B = _mm_madd_epi16(T_00_00B, c16_p17_p42); O1B = _mm_madd_epi16(T_00_00B, c16_n42_p17); res00[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0A, O0A), 5), _mm_srai_epi32(_mm_add_epi32(E0B, O0B), 5)); res03[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0A, O0A), 5), _mm_srai_epi32(_mm_sub_epi32(E0B, O0B), 5)); res01[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1A, O1A), 5), _mm_srai_epi32(_mm_add_epi32(E1B, O1B), 5)); res02[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1A, O1A), 5), _mm_srai_epi32(_mm_sub_epi32(E1B, O1B), 5)); } // transpose matrix { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; tr0_0 = _mm_unpacklo_epi16(res00[0], res01[0]); tr0_1 = _mm_unpacklo_epi16(res02[0], res03[0]); tr0_2 = _mm_unpackhi_epi16(res00[0], res01[0]); tr0_3 = _mm_unpackhi_epi16(res02[0], res03[0]); tr0_4 = _mm_unpacklo_epi16(res00[1], res01[1]); tr0_5 = _mm_unpacklo_epi16(res02[1], res03[1]); tr0_6 = _mm_unpackhi_epi16(res00[1], res01[1]); tr0_7 = _mm_unpackhi_epi16(res02[1], res03[1]); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // second fft c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 { const __m128i T_00_00A = _mm_unpackhi_epi16(tr1_0, tr1_2); // [33 13 32 12 31 11 30 10] const __m128i T_00_01A = _mm_unpackhi_epi16(tr1_1, tr1_3); // [ ] const __m128i T_00_02A = _mm_unpackhi_epi16(tr1_4, tr1_6); // [ ] const __m128i T_00_03A = _mm_unpackhi_epi16(tr1_5, tr1_7); // [ ] const 
__m128i T_00_04A = _mm_unpacklo_epi16(tr1_2, tr1_3); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(tr1_6, tr1_7); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(tr1_1, tr1_5); // [ ]row const __m128i T_00_07A = _mm_unpacklo_epi16(tr1_0, tr1_4); // [83 03 82 02 81 01 81 00] row08 row00 __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EEO0A, EEO1A; __m128i EEE0A, EEE1A; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ row = _mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)), \ _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315))); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) #undef COMPUTE_ROW EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), shift2); // E0 + O0 + rnd [30 20 10 00] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), shift2); // E1 + O1 + rnd [31 21 11 01] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), shift2); // E2 + O2 
+ rnd [32 22 12 02] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), shift2); // E3 + O3 + rnd [33 23 13 03] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), shift2); // E4 [33 24 14 04] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), shift2); // E5 [35 25 15 05] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), shift2); // E6 [36 26 16 06] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), shift2); // E7 [37 27 17 07] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), shift2); // E7 [30 20 10 00] x8 const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), shift2); // E6 [31 21 11 01] x9 const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), shift2); // E5 [32 22 12 02] xA const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), shift2); // E4 [33 23 13 03] xB const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), shift2); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), shift2); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), shift2); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), shift2); // E0 - O0 + rnd [37 27 17 07] xF res00[0] = _mm_packs_epi32(T30A, T38A); res01[0] = _mm_packs_epi32(T31A, T39A); res02[0] = _mm_packs_epi32(T32A, T3AA); res03[0] = _mm_packs_epi32(T33A, T3BA); res00[1] = _mm_packs_epi32(T34A, T3CA); res01[1] = _mm_packs_epi32(T35A, T3DA); res02[1] = _mm_packs_epi32(T36A, T3EA); res03[1] = _mm_packs_epi32(T37A, T3FA); } } // transpose matrix tr0_0 = _mm_unpacklo_epi16(res00[0], res01[0]); tr0_1 = _mm_unpacklo_epi16(res02[0], res03[0]); tr0_2 = _mm_unpackhi_epi16(res00[0], res01[0]); tr0_3 = _mm_unpackhi_epi16(res02[0], res03[0]); tr0_4 = _mm_unpacklo_epi16(res00[1], res01[1]); tr0_5 = _mm_unpacklo_epi16(res02[1], res03[1]); tr0_6 = _mm_unpackhi_epi16(res00[1], res01[1]); tr0_7 = _mm_unpackhi_epi16(res02[1], res03[1]); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); res00[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); res01[0] = _mm_unpackhi_epi64(tr1_0, tr1_4); res02[0] = _mm_unpacklo_epi64(tr1_2, tr1_6); res03[0] = _mm_unpackhi_epi64(tr1_2, tr1_6); res00[1] = _mm_unpacklo_epi64(tr1_1, tr1_5); res01[1] = _mm_unpackhi_epi64(tr1_1, tr1_5); res02[1] = _mm_unpacklo_epi64(tr1_3, tr1_7); res03[1] = _mm_unpackhi_epi64(tr1_3, tr1_7); // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); res00[0] = _mm_max_epi16(_mm_min_epi16(res00[0], max_val), min_val); res01[0] = _mm_max_epi16(_mm_min_epi16(res01[0], max_val), min_val); res02[0] = _mm_max_epi16(_mm_min_epi16(res02[0], max_val), min_val); res03[0] = _mm_max_epi16(_mm_min_epi16(res03[0], max_val), min_val); res00[1] = _mm_max_epi16(_mm_min_epi16(res00[1], max_val), min_val); res01[1] = _mm_max_epi16(_mm_min_epi16(res01[1], max_val), min_val); res02[1] = _mm_max_epi16(_mm_min_epi16(res02[1], max_val), min_val); res03[1] = _mm_max_epi16(_mm_min_epi16(res03[1], max_val), min_val); } } _mm_storeu_si128((__m128i*)(dst + 0 * i_dst ), res00[0]); _mm_storeu_si128((__m128i*)(dst + 0 * i_dst + 8), res00[1]); 
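/* rows 1..3 below are written the same way: each 16-coefficient row is stored as two unaligned 8x16-bit halves (offsets 0 and +8) at stride i_dst */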
_mm_storeu_si128((__m128i*)(dst + 1 * i_dst ), res01[0]); _mm_storeu_si128((__m128i*)(dst + 1 * i_dst + 8), res01[1]); _mm_storeu_si128((__m128i*)(dst + 2 * i_dst ), res02[0]); _mm_storeu_si128((__m128i*)(dst + 2 * i_dst + 8), res02[1]); _mm_storeu_si128((__m128i*)(dst + 3 * i_dst ), res03[0]); _mm_storeu_si128((__m128i*)(dst + 3 * i_dst + 8), res03[1]); } /* --------------------------------------------------------------------------- */ void idct_c_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // const int shift1 = 5; const int shift2 = 20 - g_bit_depth; // const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; __m128i S0, S1, S2, S3, S4, S5, S6, S7; __m128i mAdd, T0, T1, T2, T3; __m128i E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l; __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l; __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; __m128i T00, T01, T02, T03, T04, T05, T06, T07; mAdd = _mm_set1_epi32(16); // add1 S1 = _mm_load_si128((__m128i*)&src[8]); S3 = _mm_load_si128((__m128i*)&src[24]); T0 = _mm_unpacklo_epi16(S1, S3); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); T1 = _mm_unpackhi_epi16(S1, S3); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); S5 = _mm_load_si128((__m128i*)&src[40]); S7 = _mm_load_si128((__m128i*)&src[56]); T2 = _mm_unpacklo_epi16(S5, S7); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); T3 = _mm_unpackhi_epi16(S5, S7); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ S0 = _mm_load_si128((__m128i*)&src[0]); S4 = _mm_load_si128((__m128i*)&src[32]); T0 = _mm_unpacklo_epi16(S0, S4); EE0l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); T1 = _mm_unpackhi_epi16(S0, S4); EE0h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); /* ------- */ S2 = _mm_load_si128((__m128i*)&src[16]); S6 = _mm_load_si128((__m128i*)&src[48]); T0 = _mm_unpacklo_epi16(S2, S6); E00l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); T1 = _mm_unpackhi_epi16(S2, S6); E00h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = 
_mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, mAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, mAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, mAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, mAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, mAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, mAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, mAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, mAdd); S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // first-pass shift by 5 S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5)); S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5)); S2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 5)); S5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 5)); S3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 5)); S4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 5)); /* Inverse matrix */ E0l = _mm_unpacklo_epi16(S0, S4); E1l = _mm_unpacklo_epi16(S1, S5); E2l = _mm_unpacklo_epi16(S2, S6); E3l = _mm_unpacklo_epi16(S3, S7); O0l = _mm_unpackhi_epi16(S0, S4); O1l = _mm_unpackhi_epi16(S1, S5); O2l = _mm_unpackhi_epi16(S2, S6); O3l = _mm_unpackhi_epi16(S3, S7); T0 = _mm_unpacklo_epi16(E0l, E2l); T1 = _mm_unpacklo_epi16(E1l, E3l); S0 = _mm_unpacklo_epi16(T0, T1); S1 = _mm_unpackhi_epi16(T0, T1); T2 = _mm_unpackhi_epi16(E0l, E2l); T3 = _mm_unpackhi_epi16(E1l, E3l); S2 = _mm_unpacklo_epi16(T2, T3); S3 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi16(O0l, O2l); T1 = _mm_unpacklo_epi16(O1l, O3l); S4 = _mm_unpacklo_epi16(T0, T1); S5 = _mm_unpackhi_epi16(T0, T1); T2 = _mm_unpackhi_epi16(O0l, O2l); T3 = _mm_unpackhi_epi16(O1l, O3l); S6 = _mm_unpacklo_epi16(T2, T3); S7 = _mm_unpackhi_epi16(T2, T3); mAdd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 T0 = _mm_unpacklo_epi16(S1, S3); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); T1 = _mm_unpackhi_epi16(S1, S3); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); T2 = _mm_unpacklo_epi16(S5, S7); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); T3 = _mm_unpackhi_epi16(S5, S7); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(T1, 
_mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); T0 = _mm_unpacklo_epi16(S0, S4); T1 = _mm_unpackhi_epi16(S0, S4); EE0l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE0h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); T0 = _mm_unpacklo_epi16(S2, S6); T1 = _mm_unpackhi_epi16(S2, S6); E00l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E00h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, mAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, mAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, mAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, mAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, mAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, mAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, mAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, mAdd); S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift2), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift2)); S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift2), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift2)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift2), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift2)); S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift2), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift2)); S2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift2), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift2)); S5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift2), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift2)); S3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift2), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift2)); S4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift2), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift2)); // [07 06 05 04 03 02 01 00] // [17 16 15 14 13 12 11 10] // [27 26 25 24 23 22 21 20] // [37 36 35 34 33 32 31 30] // [47 46 45 44 43 42 41 40] // [57 56 55 54 53 52 51 50] // [67 66 65 64 63 62 61 60] // [77 76 75 74 73 72 71 70] T00 = _mm_unpacklo_epi16(S0, S1); // [13 03 12 02 11 01 10 00] T01 = _mm_unpackhi_epi16(S0, S1); // [17 07 16 06 15 05 14 04] T02 = _mm_unpacklo_epi16(S2, S3); // [33 23 32 22 31 21 30 20] T03 = _mm_unpackhi_epi16(S2, S3); // [37 27 36 26 35 25 34 24] T04 = _mm_unpacklo_epi16(S4, S5); // [53 43 52 42 51 41 50 40] T05 = _mm_unpackhi_epi16(S4, S5); // [57 47 56 46 55 45 54 44] T06 = _mm_unpacklo_epi16(S6, S7); // [73 63 72 62 71 61 70 60] T07 = _mm_unpackhi_epi16(S6, S7); // [77 67 76 66 75 65 74 64] // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); T00 = _mm_max_epi16(_mm_min_epi16(T00, max_val), min_val); T01 = _mm_max_epi16(_mm_min_epi16(T01, max_val), min_val); T02 = _mm_max_epi16(_mm_min_epi16(T02, max_val), min_val); T03 = _mm_max_epi16(_mm_min_epi16(T03, max_val), 
min_val); T04 = _mm_max_epi16(_mm_min_epi16(T04, max_val), min_val); T05 = _mm_max_epi16(_mm_min_epi16(T05, max_val), min_val); T06 = _mm_max_epi16(_mm_min_epi16(T06, max_val), min_val); T07 = _mm_max_epi16(_mm_min_epi16(T07, max_val), min_val); } { __m128i T10, T11, T12, T13; T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00] T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02] T12 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40] T13 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42] _mm_store_si128((__m128i*)(dst + 0 * i_dst), _mm_unpacklo_epi64(T10, T12)); // [70 60 50 40 30 20 10 00] _mm_store_si128((__m128i*)(dst + 1 * i_dst), _mm_unpackhi_epi64(T10, T12)); // [71 61 51 41 31 21 11 01] _mm_store_si128((__m128i*)(dst + 2 * i_dst), _mm_unpacklo_epi64(T11, T13)); // [72 62 52 42 32 22 12 02] _mm_store_si128((__m128i*)(dst + 3 * i_dst), _mm_unpackhi_epi64(T11, T13)); // [73 63 53 43 33 23 13 03] T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04] T12 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44] T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06] T13 = _mm_unpackhi_epi32(T05, T07); // [77 67 57 47 76 56 46 36] _mm_store_si128((__m128i*)(dst + 4 * i_dst), _mm_unpacklo_epi64(T10, T12)); // [74 64 54 44 34 24 14 04] _mm_store_si128((__m128i*)(dst + 5 * i_dst), _mm_unpackhi_epi64(T10, T12)); // [75 65 55 45 35 25 15 05] _mm_store_si128((__m128i*)(dst + 6 * i_dst), _mm_unpacklo_epi64(T11, T13)); // [76 66 56 46 36 26 16 06] _mm_store_si128((__m128i*)(dst + 7 * i_dst), _mm_unpackhi_epi64(T11, T13)); // [77 67 57 47 37 27 17 07] } } /* --------------------------------------------------------------------------- */ void idct_c_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; //const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = 
_mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); int i, pass, part; int nShift = shift1; __m128i c32_rnd = _mm_set1_epi32((1 << shift1) >> 1); // add1 // DCT1 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]; __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2]; __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; for (i = 0; i < 2; i++) { const int offset = (i << 3); in00[i] = _mm_load_si128((const __m128i*)&src[ 0 * 16 + offset]); // [07 06 05 04 03 02 01 00] in01[i] = _mm_load_si128((const __m128i*)&src[ 1 * 16 + offset]); // [17 16 15 14 13 12 11 10] in02[i] = _mm_load_si128((const __m128i*)&src[ 2 * 16 + offset]); // [27 26 25 24 23 22 21 20] in03[i] = _mm_load_si128((const __m128i*)&src[ 3 * 16 + offset]); // [37 36 35 34 33 32 31 30] in04[i] = _mm_load_si128((const __m128i*)&src[ 4 * 16 + offset]); // [47 46 45 44 43 42 41 40] in05[i] = _mm_load_si128((const __m128i*)&src[ 5 * 16 + offset]); // [57 56 55 54 53 52 51 50] in06[i] = _mm_load_si128((const __m128i*)&src[ 6 * 16 + offset]); // [67 66 65 64 63 62 61 60] in07[i] = _mm_load_si128((const __m128i*)&src[ 7 * 16 + offset]); // [77 76 75 74 73 72 71 70] in08[i] = _mm_load_si128((const __m128i*)&src[ 8 * 16 + offset]); in09[i] = _mm_load_si128((const __m128i*)&src[ 9 * 16 + offset]); in10[i] = _mm_load_si128((const __m128i*)&src[10 * 16 + offset]); in11[i] = _mm_load_si128((const __m128i*)&src[11 * 16 + offset]); in12[i] = _mm_load_si128((const __m128i*)&src[12 * 16 + offset]); in13[i] = _mm_load_si128((const __m128i*)&src[13 * 16 + offset]); in14[i] = _mm_load_si128((const __m128i*)&src[14 * 16 + offset]); in15[i] = _mm_load_si128((const __m128i*)&src[15 * 16 + offset]); } for (pass = 0; pass < 2; pass++) { for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_04B = 
_mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 81 00] row08 row00 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EO0B, EO1B, EO2B, EO3B; __m128i EEO0A, EEO1A; __m128i EEO0B, EEO1B; __m128i EEE0A, EEE1A; __m128i EEE0B, EEE1B; __m128i T00, T01; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7B) #undef COMPUTE_ROW EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p38_p44), _mm_madd_epi16(T_00_05B, c16_p09_p25)); EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n09_p38), _mm_madd_epi16(T_00_05B, c16_n25_n44)); EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n44_p25), _mm_madd_epi16(T_00_05B, c16_p38_p09)); EO3A = 
_mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n25_p09), _mm_madd_epi16(T_00_05B, c16_n44_p38)); EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm_madd_epi16(T_00_06B, c16_n42_p17); EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm_madd_epi16(T_00_07B, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E0 = EE0 - EO0 const __m128i E7B = _mm_sub_epi32(EE0B, EO0B); const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E1 = EE1 - EO1 const __m128i E6B = _mm_sub_epi32(EE1B, EO1B); const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E2 = EE2 - EO2 const __m128i E5B = _mm_sub_epi32(EE2B, EO2B); const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E3 = EE3 - EO3 const __m128i E4B = _mm_sub_epi32(EE3B, EO3B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0 + O0 + rnd const __m128i T20B = _mm_add_epi32(T10B, O0B); const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1 + O1 + rnd const __m128i T21B = _mm_add_epi32(T11B, O1B); const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2 + O2 + rnd const __m128i T22B = _mm_add_epi32(T12B, O2B); const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3 + O3 + rnd const __m128i T23B = _mm_add_epi32(T13B, O3B); const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4 const __m128i T24B = _mm_add_epi32(T14B, O4B); const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5 const __m128i T25B = _mm_add_epi32(T15B, O5B); const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6 
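/* the B registers repeat each butterfly step for the upper four columns (unpackhi halves) of the current 8-column part */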
const __m128i T26B = _mm_add_epi32(T16B, O6B); const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7 const __m128i T27B = _mm_add_epi32(T17B, O7B); const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0 - O0 + rnd const __m128i T2FB = _mm_sub_epi32(T10B, O0B); const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1 - O1 + rnd const __m128i T2EB = _mm_sub_epi32(T11B, O1B); const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2 - O2 + rnd const __m128i T2DB = _mm_sub_epi32(T12B, O2B); const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3 - O3 + rnd const __m128i T2CB = _mm_sub_epi32(T13B, O3B); const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4 const __m128i T2BB = _mm_sub_epi32(T14B, O4B); const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5 const __m128i T2AB = _mm_sub_epi32(T15B, O5B); const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6 const __m128i T29B = _mm_sub_epi32(T16B, O6B); const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7 const __m128i T28B = _mm_sub_epi32(T17B, O7B); const __m128i T30A = _mm_srai_epi32(T20A, nShift); // [30 20 10 00] const __m128i T30B = _mm_srai_epi32(T20B, nShift); // [70 60 50 40] const __m128i T31A = _mm_srai_epi32(T21A, nShift); // [31 21 11 01] const __m128i T31B = _mm_srai_epi32(T21B, nShift); // [71 61 51 41] const __m128i T32A = _mm_srai_epi32(T22A, nShift); // [32 22 12 02] const __m128i T32B = _mm_srai_epi32(T22B, nShift); // [72 62 52 42] const __m128i T33A = _mm_srai_epi32(T23A, nShift); // [33 23 13 03] const __m128i T33B = _mm_srai_epi32(T23B, nShift); // [73 63 53 43] const __m128i T34A = _mm_srai_epi32(T24A, nShift); // [33 24 14 04] const __m128i T34B = _mm_srai_epi32(T24B, nShift); // [74 64 54 44] const __m128i T35A = _mm_srai_epi32(T25A, nShift); // [35 25 15 05] const __m128i T35B = _mm_srai_epi32(T25B, nShift); // [75 65 55 45] const __m128i T36A = _mm_srai_epi32(T26A, nShift); // [36 26 16 06] const __m128i T36B = _mm_srai_epi32(T26B, nShift); // [76 66 56 46] const __m128i T37A = _mm_srai_epi32(T27A, nShift); // [37 27 17 07] const __m128i T37B = _mm_srai_epi32(T27B, nShift); // [77 67 57 47] const __m128i T38A = _mm_srai_epi32(T28A, nShift); // [30 20 10 00] x8 const __m128i T38B = _mm_srai_epi32(T28B, nShift); // [70 60 50 40] const __m128i T39A = _mm_srai_epi32(T29A, nShift); // [31 21 11 01] x9 const __m128i T39B = _mm_srai_epi32(T29B, nShift); // [71 61 51 41] const __m128i T3AA = _mm_srai_epi32(T2AA, nShift); // [32 22 12 02] xA const __m128i T3AB = _mm_srai_epi32(T2AB, nShift); // [72 62 52 42] const __m128i T3BA = _mm_srai_epi32(T2BA, nShift); // [33 23 13 03] xB const __m128i T3BB = _mm_srai_epi32(T2BB, nShift); // [73 63 53 43] const __m128i T3CA = _mm_srai_epi32(T2CA, nShift); // [33 24 14 04] xC const __m128i T3CB = _mm_srai_epi32(T2CB, nShift); // [74 64 54 44] const __m128i T3DA = _mm_srai_epi32(T2DA, nShift); // [35 25 15 05] xD const __m128i T3DB = _mm_srai_epi32(T2DB, nShift); // [75 65 55 45] const __m128i T3EA = _mm_srai_epi32(T2EA, nShift); // [36 26 16 06] xE const __m128i T3EB = _mm_srai_epi32(T2EB, nShift); // [76 66 56 46] const __m128i T3FA = _mm_srai_epi32(T2FA, nShift); // [37 27 17 07] xF const __m128i T3FB = _mm_srai_epi32(T2FB, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] 
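/* the remaining rows are packed identically: _mm_packs_epi32 saturates the 32-bit terms back to 16 bits, with columns 0..3 (A) in the low half and columns 4..7 (B) in the high half */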
res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87] } } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) #undef TRANSPOSE_8x8_16BIT } nShift = shift2; c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); in00[0] = _mm_max_epi16(_mm_min_epi16(in00[0], max_val), min_val); in00[1] = _mm_max_epi16(_mm_min_epi16(in00[1], max_val), min_val); in01[0] = _mm_max_epi16(_mm_min_epi16(in01[0], max_val), min_val); in01[1] = _mm_max_epi16(_mm_min_epi16(in01[1], max_val), min_val); in02[0] = _mm_max_epi16(_mm_min_epi16(in02[0], max_val), min_val); in02[1] = _mm_max_epi16(_mm_min_epi16(in02[1], max_val), min_val); in03[0] = _mm_max_epi16(_mm_min_epi16(in03[0], max_val), min_val); in03[1] = _mm_max_epi16(_mm_min_epi16(in03[1], max_val), min_val); in04[0] = _mm_max_epi16(_mm_min_epi16(in04[0], max_val), min_val); in04[1] = _mm_max_epi16(_mm_min_epi16(in04[1], max_val), min_val); in05[0] = 
_mm_max_epi16(_mm_min_epi16(in05[0], max_val), min_val); in05[1] = _mm_max_epi16(_mm_min_epi16(in05[1], max_val), min_val); in06[0] = _mm_max_epi16(_mm_min_epi16(in06[0], max_val), min_val); in06[1] = _mm_max_epi16(_mm_min_epi16(in06[1], max_val), min_val); in07[0] = _mm_max_epi16(_mm_min_epi16(in07[0], max_val), min_val); in07[1] = _mm_max_epi16(_mm_min_epi16(in07[1], max_val), min_val); in08[0] = _mm_max_epi16(_mm_min_epi16(in08[0], max_val), min_val); in08[1] = _mm_max_epi16(_mm_min_epi16(in08[1], max_val), min_val); in09[0] = _mm_max_epi16(_mm_min_epi16(in09[0], max_val), min_val); in09[1] = _mm_max_epi16(_mm_min_epi16(in09[1], max_val), min_val); in10[0] = _mm_max_epi16(_mm_min_epi16(in10[0], max_val), min_val); in10[1] = _mm_max_epi16(_mm_min_epi16(in10[1], max_val), min_val); in11[0] = _mm_max_epi16(_mm_min_epi16(in11[0], max_val), min_val); in11[1] = _mm_max_epi16(_mm_min_epi16(in11[1], max_val), min_val); in12[0] = _mm_max_epi16(_mm_min_epi16(in12[0], max_val), min_val); in12[1] = _mm_max_epi16(_mm_min_epi16(in12[1], max_val), min_val); in13[0] = _mm_max_epi16(_mm_min_epi16(in13[0], max_val), min_val); in13[1] = _mm_max_epi16(_mm_min_epi16(in13[1], max_val), min_val); in14[0] = _mm_max_epi16(_mm_min_epi16(in14[0], max_val), min_val); in14[1] = _mm_max_epi16(_mm_min_epi16(in14[1], max_val), min_val); in15[0] = _mm_max_epi16(_mm_min_epi16(in15[0], max_val), min_val); in15[1] = _mm_max_epi16(_mm_min_epi16(in15[1], max_val), min_val); } // store _mm_store_si128((__m128i*)(dst + 0 * i_dst + 0), in00[0]); _mm_store_si128((__m128i*)(dst + 0 * i_dst + 8), in00[1]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 0), in01[0]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 8), in01[1]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 0), in02[0]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 8), in02[1]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 0), in03[0]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 8), in03[1]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 0), in04[0]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 8), in04[1]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 0), in05[0]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 8), in05[1]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 0), in06[0]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 8), in06[1]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 0), in07[0]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 8), in07[1]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 0), in08[0]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 8), in08[1]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 0), in09[0]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 8), in09[1]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 0), in10[0]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 8), in10[1]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 0), in11[0]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 8), in11[1]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 0), in12[0]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 8), in12[1]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 0), in13[0]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 8), in13[1]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 0), in14[0]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 8), in14[1]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 0), in15[0]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 8), in15[1]); } /* --------------------------------------------------------------------------- */ void idct_c_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { int a_flag = i_dst & 
0x01; //int shift1 = 5; int shift2 = 20 - g_bit_depth - a_flag; //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + a_flag; const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = _mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); 
const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = _mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 
= _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); // add1 int nShift = 5; int i, pass, part; // DCT1 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4]; __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4]; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; i_dst &= 0xFE; /* remember to remove the flag bit */ for (i = 0; i < 4; i++) { const int offset = (i << 3); in00[i] = _mm_loadu_si128((const __m128i*)&src[ 0 
* 32 + offset]); in01[i] = _mm_loadu_si128((const __m128i*)&src[ 1 * 32 + offset]); in02[i] = _mm_loadu_si128((const __m128i*)&src[ 2 * 32 + offset]); in03[i] = _mm_loadu_si128((const __m128i*)&src[ 3 * 32 + offset]); in04[i] = _mm_loadu_si128((const __m128i*)&src[ 4 * 32 + offset]); in05[i] = _mm_loadu_si128((const __m128i*)&src[ 5 * 32 + offset]); in06[i] = _mm_loadu_si128((const __m128i*)&src[ 6 * 32 + offset]); in07[i] = _mm_loadu_si128((const __m128i*)&src[ 7 * 32 + offset]); in08[i] = _mm_loadu_si128((const __m128i*)&src[ 8 * 32 + offset]); in09[i] = _mm_loadu_si128((const __m128i*)&src[ 9 * 32 + offset]); in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]); in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]); in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]); in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]); in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]); in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]); in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]); in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]); in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]); in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]); in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]); in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]); in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]); in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]); in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]); in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]); in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]); } for (pass = 0; pass < 2; pass++) { if (pass == 1) { c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; } for (part = 0; part < 4; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in17[part], in19[part]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in17[part], in19[part]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in21[part], in23[part]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in21[part], in23[part]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in25[part], in27[part]); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(in25[part], in27[part]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in29[part], in31[part]); // const __m128i T_00_07B = _mm_unpackhi_epi16(in29[part], in31[part]); // [ ] const __m128i T_00_08A = 
_mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(in18[part], in22[part]); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(in18[part], in22[part]); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(in26[part], in30[part]); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(in26[part], in30[part]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(in20[part], in28[part]); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(in20[part], in28[part]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], in24[part]); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], in24[part]); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], in16[part]); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], in16[part]); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, 
T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, 
c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m128i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, 
c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = 
_mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = 
_mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = 
_mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const 
__m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm_packs_epi32(T3_16A, T3_16B); res17[part] = _mm_packs_epi32(T3_17A, T3_17B); res18[part] = _mm_packs_epi32(T3_18A, T3_18B); res19[part] = _mm_packs_epi32(T3_19A, T3_19B); res20[part] = _mm_packs_epi32(T3_20A, T3_20B); res21[part] = _mm_packs_epi32(T3_21A, T3_21B); res22[part] = _mm_packs_epi32(T3_22A, T3_22B); res23[part] = _mm_packs_epi32(T3_23A, T3_23B); res24[part] = _mm_packs_epi32(T3_24A, T3_24B); res25[part] = _mm_packs_epi32(T3_25A, T3_25B); res26[part] = _mm_packs_epi32(T3_26A, T3_26B); res27[part] = _mm_packs_epi32(T3_27A, T3_27B); res28[part] = _mm_packs_epi32(T3_28A, T3_28B); res29[part] = _mm_packs_epi32(T3_29A, T3_29B); res30[part] = _mm_packs_epi32(T3_30A, T3_30B); res31[part] = _mm_packs_epi32(T3_31A, T3_31B); } } //transpose matrix 8x8 16bit. 
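        // Note (added for readability): the 32x32 intermediate is kept as a 4x4 grid of
        // 8x8 sub-blocks, where resNN[part] holds columns 8*part .. 8*part+7 of row NN.
        // The block below transposes the whole matrix by moving sub-block (r, c) to
        // position (c, r) while TRANSPOSE_8x8_16BIT transposes each 8x8 tile, so the
        // following stage can again operate on contiguous 16-bit rows held in inNN[].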
{ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0]) TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1]) TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2]) TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], 
in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3]) TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3]) #undef TRANSPOSE_8x8_16BIT } } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); int k; for (k = 0; k < 4; k++) { in00[k] = _mm_max_epi16(_mm_min_epi16(in00[k], max_val), min_val); in01[k] = _mm_max_epi16(_mm_min_epi16(in01[k], max_val), min_val); in02[k] = _mm_max_epi16(_mm_min_epi16(in02[k], max_val), min_val); in03[k] = _mm_max_epi16(_mm_min_epi16(in03[k], max_val), min_val); in04[k] = _mm_max_epi16(_mm_min_epi16(in04[k], max_val), min_val); in05[k] = _mm_max_epi16(_mm_min_epi16(in05[k], max_val), min_val); in06[k] = _mm_max_epi16(_mm_min_epi16(in06[k], max_val), min_val); in07[k] = _mm_max_epi16(_mm_min_epi16(in07[k], max_val), min_val); in08[k] = _mm_max_epi16(_mm_min_epi16(in08[k], max_val), min_val); in09[k] = _mm_max_epi16(_mm_min_epi16(in09[k], max_val), min_val); in10[k] = _mm_max_epi16(_mm_min_epi16(in10[k], max_val), min_val); in11[k] = _mm_max_epi16(_mm_min_epi16(in11[k], max_val), min_val); in12[k] = _mm_max_epi16(_mm_min_epi16(in12[k], max_val), min_val); in13[k] = _mm_max_epi16(_mm_min_epi16(in13[k], max_val), min_val); in14[k] = _mm_max_epi16(_mm_min_epi16(in14[k], max_val), min_val); in15[k] = _mm_max_epi16(_mm_min_epi16(in15[k], max_val), min_val); in16[k] = _mm_max_epi16(_mm_min_epi16(in16[k], max_val), min_val); in17[k] = _mm_max_epi16(_mm_min_epi16(in17[k], max_val), min_val); in18[k] = _mm_max_epi16(_mm_min_epi16(in18[k], max_val), min_val); in19[k] = _mm_max_epi16(_mm_min_epi16(in19[k], max_val), min_val); in20[k] = _mm_max_epi16(_mm_min_epi16(in20[k], max_val), min_val); in21[k] = _mm_max_epi16(_mm_min_epi16(in21[k], max_val), min_val); in22[k] = _mm_max_epi16(_mm_min_epi16(in22[k], max_val), min_val); in23[k] = _mm_max_epi16(_mm_min_epi16(in23[k], max_val), min_val); in24[k] = _mm_max_epi16(_mm_min_epi16(in24[k], max_val), min_val); in25[k] = _mm_max_epi16(_mm_min_epi16(in25[k], max_val), min_val); in26[k] = _mm_max_epi16(_mm_min_epi16(in26[k], max_val), min_val); in27[k] = _mm_max_epi16(_mm_min_epi16(in27[k], max_val), min_val); in28[k] = _mm_max_epi16(_mm_min_epi16(in28[k], max_val), min_val); in29[k] = _mm_max_epi16(_mm_min_epi16(in29[k], max_val), min_val); in30[k] = _mm_max_epi16(_mm_min_epi16(in30[k], max_val), min_val); in31[k] = _mm_max_epi16(_mm_min_epi16(in31[k], max_val), min_val); } } // Add for (i = 0; i < 2; i++) { #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+0), L0); \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+8), H0); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+0), L1); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+8), H1); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+0), L2); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+8), H2); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+0), L3); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+8), H3); \ 
_mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+0), L4); \ _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+8), H4); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+0), L5); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+8), H5); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+0), L6); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+8), H6); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+0), L7); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+8), H7); const int k = i * 2; STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16) STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16) STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16) STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16) #undef STORE_LINE } } /* --------------------------------------------------------------------------- */ void idct_c_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { __m128i m128iS0[4], m128iS1[4], m128iS2[4], m128iS3[4], m128iS4[4], m128iS5[4], m128iS6[4], m128iS7[4]; __m128i m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3; __m128i E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l; __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l; __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; //int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i, pass; i_dst &= 0xFE; /* remember to remove the flag bit */ m128iAdd = _mm_set1_epi32(16); // add1 for (pass = 0; pass < 4; pass++) { m128iS1[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 1 * 32]); m128iS3[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 3 * 32]); m128Tmp0 = _mm_unpacklo_epi16(m128iS1[pass], m128iS3[pass]); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1[pass], m128iS3[pass]); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128iS5[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 5 * 32]); m128iS7[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 7 * 32]); m128Tmp2 = _mm_unpacklo_epi16(m128iS5[pass], m128iS7[pass]); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5[pass], m128iS7[pass]); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, 
_mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ m128iS0[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 0 * 32]); m128iS4[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 4 * 32]); m128Tmp0 = _mm_unpacklo_epi16(m128iS0[pass], m128iS4[pass]); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS0[pass], m128iS4[pass]); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); /* ------- */ m128iS2[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 2 * 32]); m128iS6[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 6 * 32]); m128Tmp0 = _mm_unpacklo_epi16(m128iS2[pass], m128iS6[pass]); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS2[pass], m128iS6[pass]); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, m128iAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, m128iAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, m128iAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, m128iAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, m128iAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, m128iAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, m128iAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, m128iAdd); m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); /* shift1 = 5: rounding shift of the first transform pass */ m128iS7[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5)); m128iS1[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5)); m128iS6[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5)); m128iS2[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 5)); m128iS5[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 5)); m128iS3[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 5)); m128iS4[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 5)); /* transpose matrix */ E0l = _mm_unpacklo_epi16(m128iS0[pass], m128iS4[pass]); E1l = _mm_unpacklo_epi16(m128iS1[pass], m128iS5[pass]); E2l = 
_mm_unpacklo_epi16(m128iS2[pass], m128iS6[pass]); E3l = _mm_unpacklo_epi16(m128iS3[pass], m128iS7[pass]); O0l = _mm_unpackhi_epi16(m128iS0[pass], m128iS4[pass]); O1l = _mm_unpackhi_epi16(m128iS1[pass], m128iS5[pass]); O2l = _mm_unpackhi_epi16(m128iS2[pass], m128iS6[pass]); O3l = _mm_unpackhi_epi16(m128iS3[pass], m128iS7[pass]); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); m128iS0[pass] = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS1[pass] = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); m128iS2[pass] = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS3[pass] = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); m128iS4[pass] = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS5[pass] = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); m128iS6[pass] = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS7[pass] = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); } { const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = 
_mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = 
_mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 = _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = 
_mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 int nShift = shift2; // DCT1 __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; const __m128i T_00_00A = _mm_unpacklo_epi16(m128iS1[0], m128iS3[0]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(m128iS1[0], m128iS3[0]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(m128iS5[0], m128iS7[0]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(m128iS5[0], m128iS7[0]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(m128iS1[1], m128iS3[1]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(m128iS1[1], m128iS3[1]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(m128iS5[1], m128iS7[1]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(m128iS5[1], m128iS7[1]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(m128iS1[2], m128iS3[2]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(m128iS1[2], m128iS3[2]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(m128iS5[2], m128iS7[2]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(m128iS5[2], m128iS7[2]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(m128iS1[3], m128iS3[3]); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(m128iS1[3], m128iS3[3]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(m128iS5[3], m128iS7[3]); // const __m128i T_00_07B = _mm_unpackhi_epi16(m128iS5[3], m128iS7[3]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(m128iS2[0], m128iS6[0]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(m128iS2[0], m128iS6[0]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(m128iS2[1], m128iS6[1]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(m128iS2[1], m128iS6[1]); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(m128iS2[2], m128iS6[2]); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(m128iS2[2], m128iS6[2]); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(m128iS2[3], m128iS6[3]); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(m128iS2[3], m128iS6[3]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(m128iS4[0], m128iS4[1]); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(m128iS4[0], m128iS4[1]); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(m128iS4[2], m128iS4[3]); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(m128iS4[2], m128iS4[3]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(m128iS0[1], m128iS0[3]); // const __m128i T_00_14B = _mm_unpackhi_epi16(m128iS0[1], m128iS0[3]); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(m128iS0[0], m128iS0[2]); // const __m128i T_00_15B = _mm_unpackhi_epi16(m128iS0[0], m128iS0[2]); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = 
_mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ 
c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW { #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), 
_mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + 
EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, 
c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = 
_mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // 
[30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[0] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[0] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[0] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[0] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[0] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[0] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[0] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[0] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 
87] res16[0] = _mm_packs_epi32(T3_16A, T3_16B); res17[0] = _mm_packs_epi32(T3_17A, T3_17B); res18[0] = _mm_packs_epi32(T3_18A, T3_18B); res19[0] = _mm_packs_epi32(T3_19A, T3_19B); res20[0] = _mm_packs_epi32(T3_20A, T3_20B); res21[0] = _mm_packs_epi32(T3_21A, T3_21B); res22[0] = _mm_packs_epi32(T3_22A, T3_22B); res23[0] = _mm_packs_epi32(T3_23A, T3_23B); res24[0] = _mm_packs_epi32(T3_24A, T3_24B); res25[0] = _mm_packs_epi32(T3_25A, T3_25B); res26[0] = _mm_packs_epi32(T3_26A, T3_26B); res27[0] = _mm_packs_epi32(T3_27A, T3_27B); res28[0] = _mm_packs_epi32(T3_28A, T3_28B); res29[0] = _mm_packs_epi32(T3_29A, T3_29B); res30[0] = _mm_packs_epi32(T3_30A, T3_30B); res31[0] = _mm_packs_epi32(T3_31A, T3_31B); } //transpose matrix 8x8 16bit. { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], m128iS0[0], m128iS1[0], m128iS2[0], m128iS3[0], m128iS4[0], m128iS5[0], m128iS6[0], m128iS7[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], m128iS0[1], m128iS1[1], m128iS2[1], m128iS3[1], m128iS4[1], m128iS5[1], m128iS6[1], m128iS7[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], m128iS0[2], m128iS1[2], m128iS2[2], m128iS3[2], m128iS4[2], m128iS5[2], m128iS6[2], m128iS7[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], m128iS0[3], m128iS1[3], m128iS2[3], m128iS3[3], m128iS4[3], m128iS5[3], m128iS6[3], m128iS7[3]) #undef TRANSPOSE_8x8_16BIT } } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); for (i = 0; i < 4; i++) { m128iS0[i] = _mm_min_epi16(m128iS0[i], max_val); m128iS0[i] = _mm_max_epi16(m128iS0[i], min_val); m128iS1[i] = _mm_min_epi16(m128iS1[i], max_val); m128iS1[i] = _mm_max_epi16(m128iS1[i], min_val); m128iS2[i] = _mm_min_epi16(m128iS2[i], max_val); m128iS2[i] = _mm_max_epi16(m128iS2[i], min_val); m128iS3[i] = _mm_min_epi16(m128iS3[i], max_val); m128iS3[i] = _mm_max_epi16(m128iS3[i], min_val); m128iS4[i] = _mm_min_epi16(m128iS4[i], max_val); m128iS4[i] = _mm_max_epi16(m128iS4[i], min_val); m128iS5[i] = _mm_min_epi16(m128iS5[i], max_val); m128iS5[i] = _mm_max_epi16(m128iS5[i], min_val); m128iS6[i] = _mm_min_epi16(m128iS6[i], 
max_val); m128iS6[i] = _mm_max_epi16(m128iS6[i], min_val); m128iS7[i] = _mm_min_epi16(m128iS7[i], max_val); m128iS7[i] = _mm_max_epi16(m128iS7[i], min_val); } } // coeff_t blk2[32 * 8]; // Add for (i = 0; i < 2; i++) { #define STORE_LINE(L0, L1, L2, L3, offsetV) \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 0), L0); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 8), L1); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 16), L2); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 24), L3); STORE_LINE(m128iS0[0], m128iS0[1], m128iS0[2], m128iS0[3], 0) STORE_LINE(m128iS1[0], m128iS1[1], m128iS1[2], m128iS1[3], 1) STORE_LINE(m128iS2[0], m128iS2[1], m128iS2[2], m128iS2[3], 2) STORE_LINE(m128iS3[0], m128iS3[1], m128iS3[2], m128iS3[3], 3) STORE_LINE(m128iS4[0], m128iS4[1], m128iS4[2], m128iS4[3], 4) STORE_LINE(m128iS5[0], m128iS5[1], m128iS5[2], m128iS5[3], 5) STORE_LINE(m128iS6[0], m128iS6[1], m128iS6[2], m128iS6[3], 6) STORE_LINE(m128iS7[0], m128iS7[1], m128iS7[2], m128iS7[3], 7) #undef STORE_LINE } } /* --------------------------------------------------------------------------- */ void idct_c_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i 
c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = _mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = 
_mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = _mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 = _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = 
_mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); int nShift = 5, pass; //int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); // DCT1 __m128i in00, in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15; __m128i in16, in17, in18, in19, in20, in21, in22, in23, in24, in25, in26, in27, in28, in29, in30, in31; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4]; i_dst &= 0xFE; in00 = _mm_load_si128((const __m128i*)&src[0 * 8]); in01 = _mm_load_si128((const __m128i*)&src[ 1 * 8]); in02 = _mm_load_si128((const __m128i*)&src[ 2 * 8]); in03 = _mm_load_si128((const __m128i*)&src[ 3 * 8]); in04 = _mm_load_si128((const __m128i*)&src[ 4 * 8]); in05 = _mm_load_si128((const __m128i*)&src[ 5 * 8]); in06 = _mm_load_si128((const __m128i*)&src[ 6 * 8]); in07 = _mm_load_si128((const __m128i*)&src[ 7 * 8]); in08 = _mm_load_si128((const __m128i*)&src[ 8 * 8]); in09 = _mm_load_si128((const __m128i*)&src[ 9 * 8]); in10 = _mm_load_si128((const __m128i*)&src[10 * 8]); in11 = _mm_load_si128((const __m128i*)&src[11 * 8]); in12 = _mm_load_si128((const __m128i*)&src[12 * 8]); in13 = _mm_load_si128((const __m128i*)&src[13 * 8]); in14 = _mm_load_si128((const __m128i*)&src[14 * 8]); in15 = _mm_load_si128((const __m128i*)&src[15 * 8]); in16 = _mm_load_si128((const __m128i*)&src[16 * 8]); in17 = _mm_load_si128((const __m128i*)&src[17 * 8]); in18 = _mm_load_si128((const __m128i*)&src[18 * 8]); in19 = _mm_load_si128((const __m128i*)&src[19 * 8]); in20 = _mm_load_si128((const __m128i*)&src[20 * 8]); in21 = _mm_load_si128((const __m128i*)&src[21 * 8]); in22 = _mm_load_si128((const __m128i*)&src[22 * 8]); in23 = _mm_load_si128((const __m128i*)&src[23 * 8]); in24 = _mm_load_si128((const __m128i*)&src[24 * 8]); in25 = _mm_load_si128((const __m128i*)&src[25 * 8]); in26 = _mm_load_si128((const __m128i*)&src[26 * 8]); in27 = _mm_load_si128((const __m128i*)&src[27 * 8]); in28 = _mm_load_si128((const __m128i*)&src[28 * 8]); in29 = _mm_load_si128((const __m128i*)&src[29 * 8]); in30 = _mm_load_si128((const __m128i*)&src[30 * 8]); in31 = _mm_load_si128((const __m128i*)&src[31 * 8]); { const __m128i T_00_00A = _mm_unpacklo_epi16(in01, in03); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01, in03); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05, in07); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05, in07); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09, in11); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09, in11); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13, in15); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13, in15); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in17, in19); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in17, in19); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in21, in23); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in21, in23); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in25, in27); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(in25, in27); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in29, in31); // const __m128i T_00_07B = _mm_unpackhi_epi16(in29, in31); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02, in06); // [ ] const 
__m128i T_00_08B = _mm_unpackhi_epi16(in02, in06); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10, in14); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10, in14); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(in18, in22); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(in18, in22); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(in26, in30); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(in26, in30); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04, in12); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04, in12); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(in20, in28); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(in20, in28); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08, in24); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08, in24); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00, in16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00, in16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, 
T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, 
c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m128i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = 
_mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = 
_mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = 
_mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = 
_mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE 
const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res00[1] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res01[1] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res02[1] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res03[1] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res04[1] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res05[1] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res06[1] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res07[1] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res00[2] = _mm_packs_epi32(T3_16A, T3_16B); res01[2] = _mm_packs_epi32(T3_17A, T3_17B); res02[2] = _mm_packs_epi32(T3_18A, T3_18B); res03[2] = _mm_packs_epi32(T3_19A, T3_19B); res04[2] = _mm_packs_epi32(T3_20A, T3_20B); res05[2] = _mm_packs_epi32(T3_21A, T3_21B); res06[2] = _mm_packs_epi32(T3_22A, T3_22B); res07[2] = _mm_packs_epi32(T3_23A, T3_23B); res00[3] = _mm_packs_epi32(T3_24A, T3_24B); res01[3] = _mm_packs_epi32(T3_25A, T3_25B); res02[3] = _mm_packs_epi32(T3_26A, T3_26B); res03[3] = _mm_packs_epi32(T3_27A, T3_27B); res04[3] = _mm_packs_epi32(T3_28A, T3_28B); res05[3] = _mm_packs_epi32(T3_29A, T3_29B); res06[3] = _mm_packs_epi32(T3_30A, T3_30B); res07[3] = _mm_packs_epi32(T3_31A, T3_31B); } } #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; for (pass = 0; pass < 4; pass++) { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, 
EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; TRANSPOSE_8x8_16BIT(res00[pass], res01[pass], res02[pass], res03[pass], res04[pass], res05[pass], res06[pass], res07[pass], in00, in01, in02, in03, in04, in05, in06, in07) m128Tmp0 = _mm_unpacklo_epi16(in01, in03); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(in01, in03); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp2 = _mm_unpacklo_epi16(in05, in07); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(in05, in07); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ m128Tmp0 = _mm_unpacklo_epi16(in00, in04); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(in00, in04); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); /* ------- */ m128Tmp0 = _mm_unpacklo_epi16(in02, in06); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(in02, in06); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, c32_rnd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, c32_rnd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, c32_rnd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, c32_rnd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, c32_rnd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, c32_rnd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, c32_rnd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, c32_rnd); in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift)); /* shift for the second transform pass */ in07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), nShift)); in01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, 
O1l), nShift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), nShift)); in06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), nShift)); in02 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), nShift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), nShift)); in05 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), nShift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), nShift)); in03 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), nShift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), nShift)); in04 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), nShift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), nShift)); /* Invers matrix */ E0l = _mm_unpacklo_epi16(in00, in04); E1l = _mm_unpacklo_epi16(in01, in05); E2l = _mm_unpacklo_epi16(in02, in06); E3l = _mm_unpacklo_epi16(in03, in07); O0l = _mm_unpackhi_epi16(in00, in04); O1l = _mm_unpackhi_epi16(in01, in05); O2l = _mm_unpackhi_epi16(in02, in06); O3l = _mm_unpackhi_epi16(in03, in07); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); in00 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); in00 = _mm_min_epi16(in00, max_val); in00 = _mm_max_epi16(in00, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 0 * 8], in00); in01 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); in01 = _mm_min_epi16(in01, max_val); in01 = _mm_max_epi16(in01, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 1 * 8], in01); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); in02 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); in02 = _mm_min_epi16(in02, max_val); in02 = _mm_max_epi16(in02, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 2 * 8], in02); in03 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); in03 = _mm_min_epi16(in03, max_val); in03 = _mm_max_epi16(in03, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 3 * 8], in03); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); in04 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); in04 = _mm_min_epi16(in04, max_val); in04 = _mm_max_epi16(in04, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 4 * 8], in04); in05 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); in05 = _mm_min_epi16(in05, max_val); in05 = _mm_max_epi16(in05, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 5 * 8], in05); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); in06 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); in06 = _mm_min_epi16(in06, max_val); in06 = _mm_max_epi16(in06, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 6 * 8], in06); in07 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); in07 = _mm_min_epi16(in07, max_val); in07 = _mm_max_epi16(in07, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 7 * 8], in07); } } #undef TRANSPOSE_8x8_16BIT } /* --------------------------------------------------------------------------- */ static void inv_2nd_trans_hor_sse128(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int rnd_factor = 1 << (i_shift - 1); int j; __m128i factor = _mm_set1_epi32(rnd_factor); __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load tc data, a matrix of 4x4 __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&tc[0 * SEC_TR_SIZE + 0]); // tc[0][] & tc[1][] __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&tc[2 * SEC_TR_SIZE + 0]); // tc[2][] & tc[3][] __m128i tmpCoef0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tc[0][] __m128i tmpCoef1 = _mm_unpackhi_epi16(tmpLoad0, tmpZero); // tc[1][] __m128i tmpCoef2 = 
_mm_unpacklo_epi16(tmpLoad1, tmpZero); // tc[2][] __m128i tmpCoef3 = _mm_unpackhi_epi16(tmpLoad1, tmpZero); // tc[3][] for (j = 0; j < 4; j++) { // multiple & add __m128i tmpProduct0 = _mm_madd_epi16(tmpCoef0, _mm_set1_epi32(coeff[0])); __m128i tmpProduct1 = _mm_madd_epi16(tmpCoef1, _mm_set1_epi32(coeff[1])); __m128i tmpProduct2 = _mm_madd_epi16(tmpCoef2, _mm_set1_epi32(coeff[2])); __m128i tmpProduct3 = _mm_madd_epi16(tmpCoef3, _mm_set1_epi32(coeff[3])); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), i_shift); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! _mm_storel_epi64((__m128i*)coeff, tmpDst0); // store from &coeff[0] coeff += i_coeff; } } /* --------------------------------------------------------------------------- */ static void inv_2nd_trans_ver_sse128(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { const int rnd_factor = 1 << (i_shift - 1); __m128i factor = _mm_set1_epi32(rnd_factor); __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load coeff data __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&coeff[0 ]); __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&coeff[1 * i_coeff]); __m128i tmpLoad2 = _mm_loadu_si128((__m128i*)&coeff[2 * i_coeff]); __m128i tmpLoad3 = _mm_loadu_si128((__m128i*)&coeff[3 * i_coeff]); __m128i tmpSrc0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tmpSrc[0][] __m128i tmpSrc1 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // tmpSrc[1][] __m128i tmpSrc2 = _mm_unpacklo_epi16(tmpLoad2, tmpZero); // tmpSrc[2][] __m128i tmpSrc3 = _mm_unpacklo_epi16(tmpLoad3, tmpZero); // tmpSrc[3][] int i; for (i = 0; i < 4; i++) { // multiple & add __m128i tmpProduct0 = _mm_madd_epi16(_mm_set1_epi32(tc[0 * SEC_TR_SIZE + i]), tmpSrc0); __m128i tmpProduct1 = _mm_madd_epi16(_mm_set1_epi32(tc[1 * SEC_TR_SIZE + i]), tmpSrc1); __m128i tmpProduct2 = _mm_madd_epi16(_mm_set1_epi32(tc[2 * SEC_TR_SIZE + i]), tmpSrc2); __m128i tmpProduct3 = _mm_madd_epi16(_mm_set1_epi32(tc[3 * SEC_TR_SIZE + i]), tmpSrc3); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), i_shift); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! 
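/* NOTE: the store below overwrites one 4-coefficient row of the top-left 4x4 block in place; coeff then advances by the row stride i_coeff, so this loop rewrites rows 0..3 of the secondary-transform block. */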
// store from &coeff[0] _mm_storel_epi64((__m128i*)&coeff[0 * i_coeff + 0], tmpDst0); coeff += i_coeff; } } /* --------------------------------------------------------------------------- */ void inv_transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (ht && b_left) { inv_2nd_trans_hor_sse128(coeff, i_coeff, 7, g_2T); } if (vt && b_top) { inv_2nd_trans_ver_sse128(coeff, i_coeff, 7, g_2T); } } /* --------------------------------------------------------------------------- */ void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth + 2; const int clip_depth2 = g_bit_depth + 1; /*---vertical transform first---*/ __m128i factor = _mm_set1_epi32(1 << (shift1 - 1)); // add1 __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load coeff data __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&coeff[0 ]); __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&coeff[1 * i_coeff]); __m128i tmpLoad2 = _mm_loadu_si128((__m128i*)&coeff[2 * i_coeff]); __m128i tmpLoad3 = _mm_loadu_si128((__m128i*)&coeff[3 * i_coeff]); __m128i tmpSrc0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tmpSrc[0][] __m128i tmpSrc1 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // tmpSrc[1][] __m128i tmpSrc2 = _mm_unpacklo_epi16(tmpLoad2, tmpZero); // tmpSrc[2][] __m128i tmpSrc3 = _mm_unpacklo_epi16(tmpLoad3, tmpZero); // tmpSrc[3][] int i; for (i = 0; i < 4; i++) { // multiple & add __m128i tmpProduct0 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[0 * SEC_TR_SIZE + i]), tmpSrc0); __m128i tmpProduct1 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[1 * SEC_TR_SIZE + i]), tmpSrc1); __m128i tmpProduct2 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[2 * SEC_TR_SIZE + i]), tmpSrc2); __m128i tmpProduct3 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[3 * SEC_TR_SIZE + i]), tmpSrc3); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), shift1); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! 
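/* NOTE: this vertical pass is bounded only by the int16 saturation of _mm_packs_epi32 above; the explicit clip to the clip_depth2-bit signed range is applied in the horizontal pass below. */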
_mm_storel_epi64((__m128i*)&coeff[i * i_coeff + 0], tmpDst0); // store from &coeff[0] } /*---hor transform---*/ factor = _mm_set1_epi32(1 << (shift2 - 1)); const __m128i vmax_val = _mm_set1_epi32((1 << (clip_depth2 - 1)) - 1); const __m128i vmin_val = _mm_set1_epi32(-(1 << (clip_depth2 - 1))); //load coef data, a matrix of 4x4 tmpLoad0 = _mm_loadu_si128((__m128i*)&g_2T_C[0 * SEC_TR_SIZE + 0]); // coef[0][] & coef[1][] tmpLoad1 = _mm_loadu_si128((__m128i*)&g_2T_C[2 * SEC_TR_SIZE + 0]); // coef[2][] & coef[3][] const __m128i tmpCoef0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // coef[0][] const __m128i tmpCoef1 = _mm_unpackhi_epi16(tmpLoad0, tmpZero); // coef[1][] const __m128i tmpCoef2 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // coef[2][] const __m128i tmpCoef3 = _mm_unpackhi_epi16(tmpLoad1, tmpZero); // coef[3][] for (i = 0; i < 4; i++) { // multiple & add __m128i tmpProduct0 = _mm_madd_epi16(tmpCoef0, _mm_set1_epi32(coeff[0])); __m128i tmpProduct1 = _mm_madd_epi16(tmpCoef1, _mm_set1_epi32(coeff[1])); __m128i tmpProduct2 = _mm_madd_epi16(tmpCoef2, _mm_set1_epi32(coeff[2])); __m128i tmpProduct3 = _mm_madd_epi16(tmpCoef3, _mm_set1_epi32(coeff[3])); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), shift2); // clip3 operation tmpDst0 = _mm_max_epi32(_mm_min_epi32(tmpDst0, vmax_val), vmin_val); tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! _mm_storel_epi64((__m128i*)coeff, tmpDst0); // store from &coeff[0] coeff += i_coeff; } } // transpose 8x8 & transpose 16x16 #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ #define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \ TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0); \ TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1); \ TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, 
B15_0); \ TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \ /* --------------------------------------------------------------------------- */ static void inv_wavelet_64x64_sse128(coeff_t *coeff) { int i; // 64*64 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8], T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8], T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8], T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8], T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8], T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8], T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8]; // 16*64 __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8], V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8], V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8], V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8], V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8], V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8], V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8], V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; /*--vertical transform--*/ //32*32, LOAD AND SHIFT for (i = 0; i < 4; i++) { T00[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 0]), 1); T01[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 1]), 1); T02[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 2]), 1); T03[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 3]), 1); T04[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 4]), 1); T05[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 5]), 1); T06[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 6]), 1); T07[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 7]), 1); T08[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 8]), 1); T09[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 9]), 1); T10[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 10]), 1); T11[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 11]), 1); T12[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 12]), 1); T13[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 13]), 1); T14[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 14]), 1); T15[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 15]), 1); T16[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 16]), 1); T17[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 17]), 1); T18[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 18]), 1); T19[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 19]), 1); T20[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 20]), 1); T21[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 21]), 1); T22[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 22]), 1); T23[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 23]), 1); T24[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 24]), 1); T25[i] = 
_mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 25]), 1); T26[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 26]), 1); T27[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 27]), 1); T28[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 28]), 1); T29[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 29]), 1); T30[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 30]), 1); T31[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 31]), 1); } //filter (odd pixel/row) for (i = 0; i < 4; i++) { T32[i] = _mm_srai_epi16(_mm_add_epi16(T00[i], T01[i]), 1); T33[i] = _mm_srai_epi16(_mm_add_epi16(T01[i], T02[i]), 1); T34[i] = _mm_srai_epi16(_mm_add_epi16(T02[i], T03[i]), 1); T35[i] = _mm_srai_epi16(_mm_add_epi16(T03[i], T04[i]), 1); T36[i] = _mm_srai_epi16(_mm_add_epi16(T04[i], T05[i]), 1); T37[i] = _mm_srai_epi16(_mm_add_epi16(T05[i], T06[i]), 1); T38[i] = _mm_srai_epi16(_mm_add_epi16(T06[i], T07[i]), 1); T39[i] = _mm_srai_epi16(_mm_add_epi16(T07[i], T08[i]), 1); T40[i] = _mm_srai_epi16(_mm_add_epi16(T08[i], T09[i]), 1); T41[i] = _mm_srai_epi16(_mm_add_epi16(T09[i], T10[i]), 1); T42[i] = _mm_srai_epi16(_mm_add_epi16(T10[i], T11[i]), 1); T43[i] = _mm_srai_epi16(_mm_add_epi16(T11[i], T12[i]), 1); T44[i] = _mm_srai_epi16(_mm_add_epi16(T12[i], T13[i]), 1); T45[i] = _mm_srai_epi16(_mm_add_epi16(T13[i], T14[i]), 1); T46[i] = _mm_srai_epi16(_mm_add_epi16(T14[i], T15[i]), 1); T47[i] = _mm_srai_epi16(_mm_add_epi16(T15[i], T16[i]), 1); T48[i] = _mm_srai_epi16(_mm_add_epi16(T16[i], T17[i]), 1); T49[i] = _mm_srai_epi16(_mm_add_epi16(T17[i], T18[i]), 1); T50[i] = _mm_srai_epi16(_mm_add_epi16(T18[i], T19[i]), 1); T51[i] = _mm_srai_epi16(_mm_add_epi16(T19[i], T20[i]), 1); T52[i] = _mm_srai_epi16(_mm_add_epi16(T20[i], T21[i]), 1); T53[i] = _mm_srai_epi16(_mm_add_epi16(T21[i], T22[i]), 1); T54[i] = _mm_srai_epi16(_mm_add_epi16(T22[i], T23[i]), 1); T55[i] = _mm_srai_epi16(_mm_add_epi16(T23[i], T24[i]), 1); T56[i] = _mm_srai_epi16(_mm_add_epi16(T24[i], T25[i]), 1); T57[i] = _mm_srai_epi16(_mm_add_epi16(T25[i], T26[i]), 1); T58[i] = _mm_srai_epi16(_mm_add_epi16(T26[i], T27[i]), 1); T59[i] = _mm_srai_epi16(_mm_add_epi16(T27[i], T28[i]), 1); T60[i] = _mm_srai_epi16(_mm_add_epi16(T28[i], T29[i]), 1); T61[i] = _mm_srai_epi16(_mm_add_epi16(T29[i], T30[i]), 1); T62[i] = _mm_srai_epi16(_mm_add_epi16(T30[i], T31[i]), 1); T63[i] = _mm_srai_epi16(_mm_add_epi16(T31[i], T31[i]), 1); } /*--transposition--*/ //32x64 -> 64x32 TRANSPOSE_16x16_16BIT( T00[0], T32[0], T01[0], T33[0], T02[0], T34[0], T03[0], T35[0], T04[0], T36[0], T05[0], T37[0], T06[0], T38[0], T07[0], T39[0], T00[1], T32[1], T01[1], T33[1], T02[1], T34[1], T03[1], T35[1], T04[1], T36[1], T05[1], T37[1], T06[1], T38[1], T07[1], T39[1], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_16x16_16BIT( T00[2], T32[2], T01[2], T33[2], T02[2], T34[2], T03[2], T35[2], T04[2], T36[2], T05[2], T37[2], T06[2], T38[2], T07[2], T39[2], T00[3], T32[3], T01[3], T33[3], T02[3], T34[3], T03[3], T35[3], T04[3], T36[3], T05[3], T37[3], T06[3], T38[3], T07[3], T39[3], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], 
V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_16x16_16BIT( T08[0], T40[0], T09[0], T41[0], T10[0], T42[0], T11[0], T43[0], T12[0], T44[0], T13[0], T45[0], T14[0], T46[0], T15[0], T47[0], T08[1], T40[1], T09[1], T41[1], T10[1], T42[1], T11[1], T43[1], T12[1], T44[1], T13[1], T45[1], T14[1], T46[1], T15[1], T47[1], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V00[3], V01[3], V02[3], V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], V14[3], V15[3]); TRANSPOSE_16x16_16BIT( T08[2], T40[2], T09[2], T41[2], T10[2], T42[2], T11[2], T43[2], T12[2], T44[2], T13[2], T45[2], T14[2], T46[2], T15[2], T47[2], T08[3], T40[3], T09[3], T41[3], T10[3], T42[3], T11[3], T43[3], T12[3], T44[3], T13[3], T45[3], T14[3], T46[3], T15[3], T47[3], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3]); TRANSPOSE_16x16_16BIT( T16[0], T48[0], T17[0], T49[0], T18[0], T50[0], T19[0], T51[0], T20[0], T52[0], T21[0], T53[0], T22[0], T54[0], T23[0], T55[0], T16[1], T48[1], T17[1], T49[1], T18[1], T50[1], T19[1], T51[1], T20[1], T52[1], T21[1], T53[1], T22[1], T54[1], T23[1], T55[1], V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V00[5], V01[5], V02[5], V03[5], V04[5], V05[5], V06[5], V07[5], V08[5], V09[5], V10[5], V11[5], V12[5], V13[5], V14[5], V15[5]); TRANSPOSE_16x16_16BIT( T16[2], T48[2], T17[2], T49[2], T18[2], T50[2], T19[2], T51[2], T20[2], T52[2], T21[2], T53[2], T22[2], T54[2], T23[2], T55[2], T16[3], T48[3], T17[3], T49[3], T18[3], T50[3], T19[3], T51[3], T20[3], T52[3], T21[3], T53[3], T22[3], T54[3], T23[3], T55[3], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V16[5], V17[5], V18[5], V19[5], V20[5], V21[5], V22[5], V23[5], V24[5], V25[5], V26[5], V27[5], V28[5], V29[5], V30[5], V31[5]); TRANSPOSE_16x16_16BIT( T24[0], T56[0], T25[0], T57[0], T26[0], T58[0], T27[0], T59[0], T28[0], T60[0], T29[0], T61[0], T30[0], T62[0], T31[0], T63[0], T24[1], T56[1], T25[1], T57[1], T26[1], T58[1], T27[1], T59[1], T28[1], T60[1], T29[1], T61[1], T30[1], T62[1], T31[1], T63[1], V00[6], V01[6], V02[6], V03[6], V04[6], V05[6], V06[6], V07[6], V08[6], V09[6], V10[6], V11[6], V12[6], V13[6], V14[6], V15[6], V00[7], V01[7], V02[7], V03[7], V04[7], V05[7], V06[7], V07[7], V08[7], V09[7], V10[7], V11[7], V12[7], V13[7], V14[7], V15[7]); TRANSPOSE_16x16_16BIT( T24[2], T56[2], T25[2], T57[2], T26[2], T58[2], T27[2], T59[2], T28[2], T60[2], T29[2], T61[2], T30[2], T62[2], T31[2], T63[2], T24[3], T56[3], T25[3], T57[3], T26[3], T58[3], T27[3], T59[3], T28[3], T60[3], T29[3], T61[3], T30[3], T62[3], T31[3], T63[3], V16[6], V17[6], V18[6], V19[6], V20[6], V21[6], V22[6], V23[6], V24[6], V25[6], V26[6], V27[6], V28[6], V29[6], V30[6], V31[6], V16[7], V17[7], V18[7], V19[7], V20[7], V21[7], V22[7], V23[7], V24[7], V25[7], V26[7], V27[7], V28[7], V29[7], V30[7], V31[7]); /*--horizontal transform--*/ //filter (odd pixel/column) for (i = 0; i < 8; i++) { V32[i] = _mm_srai_epi16(_mm_add_epi16(V00[i], V01[i]), 1); V33[i] = _mm_srai_epi16(_mm_add_epi16(V01[i], V02[i]), 1); V34[i] = _mm_srai_epi16(_mm_add_epi16(V02[i], V03[i]), 1); V35[i] = 
_mm_srai_epi16(_mm_add_epi16(V03[i], V04[i]), 1); V36[i] = _mm_srai_epi16(_mm_add_epi16(V04[i], V05[i]), 1); V37[i] = _mm_srai_epi16(_mm_add_epi16(V05[i], V06[i]), 1); V38[i] = _mm_srai_epi16(_mm_add_epi16(V06[i], V07[i]), 1); V39[i] = _mm_srai_epi16(_mm_add_epi16(V07[i], V08[i]), 1); V40[i] = _mm_srai_epi16(_mm_add_epi16(V08[i], V09[i]), 1); V41[i] = _mm_srai_epi16(_mm_add_epi16(V09[i], V10[i]), 1); V42[i] = _mm_srai_epi16(_mm_add_epi16(V10[i], V11[i]), 1); V43[i] = _mm_srai_epi16(_mm_add_epi16(V11[i], V12[i]), 1); V44[i] = _mm_srai_epi16(_mm_add_epi16(V12[i], V13[i]), 1); V45[i] = _mm_srai_epi16(_mm_add_epi16(V13[i], V14[i]), 1); V46[i] = _mm_srai_epi16(_mm_add_epi16(V14[i], V15[i]), 1); V47[i] = _mm_srai_epi16(_mm_add_epi16(V15[i], V16[i]), 1); V48[i] = _mm_srai_epi16(_mm_add_epi16(V16[i], V17[i]), 1); V49[i] = _mm_srai_epi16(_mm_add_epi16(V17[i], V18[i]), 1); V50[i] = _mm_srai_epi16(_mm_add_epi16(V18[i], V19[i]), 1); V51[i] = _mm_srai_epi16(_mm_add_epi16(V19[i], V20[i]), 1); V52[i] = _mm_srai_epi16(_mm_add_epi16(V20[i], V21[i]), 1); V53[i] = _mm_srai_epi16(_mm_add_epi16(V21[i], V22[i]), 1); V54[i] = _mm_srai_epi16(_mm_add_epi16(V22[i], V23[i]), 1); V55[i] = _mm_srai_epi16(_mm_add_epi16(V23[i], V24[i]), 1); V56[i] = _mm_srai_epi16(_mm_add_epi16(V24[i], V25[i]), 1); V57[i] = _mm_srai_epi16(_mm_add_epi16(V25[i], V26[i]), 1); V58[i] = _mm_srai_epi16(_mm_add_epi16(V26[i], V27[i]), 1); V59[i] = _mm_srai_epi16(_mm_add_epi16(V27[i], V28[i]), 1); V60[i] = _mm_srai_epi16(_mm_add_epi16(V28[i], V29[i]), 1); V61[i] = _mm_srai_epi16(_mm_add_epi16(V29[i], V30[i]), 1); V62[i] = _mm_srai_epi16(_mm_add_epi16(V30[i], V31[i]), 1); V63[i] = _mm_srai_epi16(_mm_add_epi16(V31[i], V31[i]), 1); } /*--transposition & Store--*/ //64x64 TRANSPOSE_16x16_16BIT( V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], V04[0], V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT( V00[2], V32[2], V01[2], V33[2], V02[2], V34[2], V03[2], V35[2], V04[2], V36[2], V05[2], V37[2], V06[2], V38[2], V07[2], V39[2], V00[3], V32[3], V01[3], V33[3], V02[3], V34[3], V03[3], V35[3], V04[3], V36[3], V05[3], V37[3], V06[3], V38[3], V07[3], V39[3], T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1]); TRANSPOSE_16x16_16BIT(V00[4], V32[4], V01[4], V33[4], V02[4], V34[4], V03[4], V35[4], V04[4], V36[4], V05[4], V37[4], V06[4], V38[4], V07[4], V39[4], V00[5], V32[5], V01[5], V33[5], V02[5], V34[5], V03[5], V35[5], V04[5], V36[5], V05[5], V37[5], V06[5], V38[5], V07[5], V39[5], T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1]); TRANSPOSE_16x16_16BIT(V00[6], V32[6], V01[6], V33[6], V02[6], V34[6], V03[6], V35[6], V04[6], V36[6], V05[6], V37[6], V06[6], V38[6], V07[6], V39[6], V00[7], V32[7], V01[7], V33[7], V02[7], V34[7], V03[7], 
V35[7], V04[7], V36[7], V05[7], V37[7], V06[7], V38[7], V07[7], V39[7], T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1]); TRANSPOSE_16x16_16BIT( V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_16x16_16BIT( V08[2], V40[2], V09[2], V41[2], V10[2], V42[2], V11[2], V43[2], V12[2], V44[2], V13[2], V45[2], V14[2], V46[2], V15[2], V47[2], V08[3], V40[3], V09[3], V41[3], V10[3], V42[3], V11[3], V43[3], V12[3], V44[3], V13[3], V45[3], V14[3], V46[3], V15[3], V47[3], T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2], T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3]); TRANSPOSE_16x16_16BIT( V08[4], V40[4], V09[4], V41[4], V10[4], V42[4], V11[4], V43[4], V12[4], V44[4], V13[4], V45[4], V14[4], V46[4], V15[4], V47[4], V08[5], V40[5], V09[5], V41[5], V10[5], V42[5], V11[5], V43[5], V12[5], V44[5], V13[5], V45[5], V14[5], V46[5], V15[5], V47[5], T32[2], T33[2], T34[2], T35[2], T36[2], T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2], T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3]); TRANSPOSE_16x16_16BIT( V08[6], V40[6], V09[6], V41[6], V10[6], V42[6], V11[6], V43[6], V12[6], V44[6], V13[6], V45[6], V14[6], V46[6], V15[6], V47[6], V08[7], V40[7], V09[7], V41[7], V10[7], V42[7], V11[7], V43[7], V12[7], V44[7], V13[7], V45[7], V14[7], V46[7], V15[7], V47[7], T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2], T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3]); TRANSPOSE_16x16_16BIT( V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5]); TRANSPOSE_16x16_16BIT( V16[2], V48[2], V17[2], V49[2], V18[2], V50[2], V19[2], V51[2], V20[2], V52[2], V21[2], V53[2], V22[2], V54[2], V23[2], V55[2], V16[3], V48[3], V17[3], V49[3], V18[3], V50[3], V19[3], V51[3], V20[3], V52[3], V21[3], V53[3], V22[3], V54[3], V23[3], V55[3], T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T16[5], T17[5], T18[5], T19[5], T20[5], T21[5], T22[5], T23[5], T24[5], T25[5], T26[5], T27[5], T28[5], T29[5], T30[5], T31[5]); 
TRANSPOSE_16x16_16BIT( V16[4], V48[4], V17[4], V49[4], V18[4], V50[4], V19[4], V51[4], V20[4], V52[4], V21[4], V53[4], V22[4], V54[4], V23[4], V55[4], V16[5], V48[5], V17[5], V49[5], V18[5], V50[5], V19[5], V51[5], V20[5], V52[5], V21[5], V53[5], V22[5], V54[5], V23[5], V55[5], T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T32[5], T33[5], T34[5], T35[5], T36[5], T37[5], T38[5], T39[5], T40[5], T41[5], T42[5], T43[5], T44[5], T45[5], T46[5], T47[5]); TRANSPOSE_16x16_16BIT( V16[6], V48[6], V17[6], V49[6], V18[6], V50[6], V19[6], V51[6], V20[6], V52[6], V21[6], V53[6], V22[6], V54[6], V23[6], V55[6], V16[7], V48[7], V17[7], V49[7], V18[7], V50[7], V19[7], V51[7], V20[7], V52[7], V21[7], V53[7], V22[7], V54[7], V23[7], V55[7], T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4], T48[5], T49[5], T50[5], T51[5], T52[5], T53[5], T54[5], T55[5], T56[5], T57[5], T58[5], T59[5], T60[5], T61[5], T62[5], T63[5]); TRANSPOSE_16x16_16BIT( V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], V24[1], V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7]); TRANSPOSE_16x16_16BIT( V24[2], V56[2], V25[2], V57[2], V26[2], V58[2], V27[2], V59[2], V28[2], V60[2], V29[2], V61[2], V30[2], V62[2], V31[2], V63[2], V24[3], V56[3], V25[3], V57[3], V26[3], V58[3], V27[3], V59[3], V28[3], V60[3], V29[3], V61[3], V30[3], V62[3], V31[3], V63[3], T16[6], T17[6], T18[6], T19[6], T20[6], T21[6], T22[6], T23[6], T24[6], T25[6], T26[6], T27[6], T28[6], T29[6], T30[6], T31[6], T16[7], T17[7], T18[7], T19[7], T20[7], T21[7], T22[7], T23[7], T24[7], T25[7], T26[7], T27[7], T28[7], T29[7], T30[7], T31[7]); TRANSPOSE_16x16_16BIT( V24[4], V56[4], V25[4], V57[4], V26[4], V58[4], V27[4], V59[4], V28[4], V60[4], V29[4], V61[4], V30[4], V62[4], V31[4], V63[4], V24[5], V56[5], V25[5], V57[5], V26[5], V58[5], V27[5], V59[5], V28[5], V60[5], V29[5], V61[5], V30[5], V62[5], V31[5], V63[5], T32[6], T33[6], T34[6], T35[6], T36[6], T37[6], T38[6], T39[6], T40[6], T41[6], T42[6], T43[6], T44[6], T45[6], T46[6], T47[6], T32[7], T33[7], T34[7], T35[7], T36[7], T37[7], T38[7], T39[7], T40[7], T41[7], T42[7], T43[7], T44[7], T45[7], T46[7], T47[7]); TRANSPOSE_16x16_16BIT( V24[6], V56[6], V25[6], V57[6], V26[6], V58[6], V27[6], V59[6], V28[6], V60[6], V29[6], V61[6], V30[6], V62[6], V31[6], V63[6], V24[7], V56[7], V25[7], V57[7], V26[7], V58[7], V27[7], V59[7], V28[7], V60[7], V29[7], V61[7], V30[7], V62[7], V31[7], V63[7], T48[6], T49[6], T50[6], T51[6], T52[6], T53[6], T54[6], T55[6], T56[6], T57[6], T58[6], T59[6], T60[6], T61[6], T62[6], T63[6], T48[7], T49[7], T50[7], T51[7], T52[7], T53[7], T54[7], T55[7], T56[7], T57[7], T58[7], T59[7], T60[7], T61[7], T62[7], T63[7]); //store for (i = 0; i < 8; i++) { _mm_storeu_si128((__m128i*)&coeff[8 * i ], T00[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 ], T01[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 2], T02[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 3], T03[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 4], T04[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 5], T05[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 6], T06[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 7], T07[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 8], T08[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 9], T09[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 10], T10[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 11], T11[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 12], T12[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 13], T13[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 14], T14[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 15], T15[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 16], T16[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 17], T17[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 18], T18[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 19], T19[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 20], T20[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 21], T21[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 22], T22[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 23], T23[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 24], T24[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 25], T25[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 26], T26[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 27], T27[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 28], T28[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 29], T29[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 30], T30[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 31], T31[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 32], T32[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 33], T33[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 34], T34[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 35], T35[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 36], T36[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 37], T37[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 38], T38[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 39], T39[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 40], T40[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 41], T41[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 42], T42[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 43], T43[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 44], T44[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 45], T45[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 46], T46[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 47], T47[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 48], T48[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 49], T49[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 50], T50[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 51], T51[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 52], T52[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 53], T53[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 54], T54[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 55], T55[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 56], T56[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 57], T57[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 58], T58[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 59], T59[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 60], T60[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 61], T61[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 62], T62[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 63], T63[i]); } } /* --------------------------------------------------------------------------- */ static void inv_wavelet_64x16_sse128(coeff_t *coeff) { int i; // 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; // 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; /*--vertical transform--*/ //32*8, LOAD AND SHIFT T00[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 0]), 1); T01[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 1]), 1); T02[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 2]), 1); T03[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 3]), 1); T04[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 4]), 1); T05[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 5]), 1); T06[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 6]), 1); T07[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 7]), 1); T00[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 0]), 1); T01[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 1]), 1); T02[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 2]), 1); T03[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 3]), 1); T04[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 4]), 1); T05[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 5]), 1); T06[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 6]), 1); T07[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 7]), 1); T00[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 0]), 1); T01[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 1]), 1); T02[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 2]), 1); T03[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 3]), 1); T04[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 4]), 1); T05[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 5]), 1); T06[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 6]), 1); T07[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 7]), 1); T00[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 0]), 1); T01[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 1]), 1); T02[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 2]), 1); T03[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 3]), 1); T04[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 4]), 1); T05[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 5]), 1); T06[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 6]), 1); T07[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 7]), 1); //filter (odd pixel/row) T08[0] = _mm_srai_epi16(_mm_add_epi16(T00[0], T01[0]), 1); T09[0] = 
_mm_srai_epi16(_mm_add_epi16(T01[0], T02[0]), 1); T10[0] = _mm_srai_epi16(_mm_add_epi16(T02[0], T03[0]), 1); T11[0] = _mm_srai_epi16(_mm_add_epi16(T03[0], T04[0]), 1); T12[0] = _mm_srai_epi16(_mm_add_epi16(T04[0], T05[0]), 1); T13[0] = _mm_srai_epi16(_mm_add_epi16(T05[0], T06[0]), 1); T14[0] = _mm_srai_epi16(_mm_add_epi16(T06[0], T07[0]), 1); T15[0] = _mm_srai_epi16(_mm_add_epi16(T07[0], T07[0]), 1); T08[1] = _mm_srai_epi16(_mm_add_epi16(T00[1], T01[1]), 1); T09[1] = _mm_srai_epi16(_mm_add_epi16(T01[1], T02[1]), 1); T10[1] = _mm_srai_epi16(_mm_add_epi16(T02[1], T03[1]), 1); T11[1] = _mm_srai_epi16(_mm_add_epi16(T03[1], T04[1]), 1); T12[1] = _mm_srai_epi16(_mm_add_epi16(T04[1], T05[1]), 1); T13[1] = _mm_srai_epi16(_mm_add_epi16(T05[1], T06[1]), 1); T14[1] = _mm_srai_epi16(_mm_add_epi16(T06[1], T07[1]), 1); T15[1] = _mm_srai_epi16(_mm_add_epi16(T07[1], T07[1]), 1); T08[2] = _mm_srai_epi16(_mm_add_epi16(T00[2], T01[2]), 1); T09[2] = _mm_srai_epi16(_mm_add_epi16(T01[2], T02[2]), 1); T10[2] = _mm_srai_epi16(_mm_add_epi16(T02[2], T03[2]), 1); T11[2] = _mm_srai_epi16(_mm_add_epi16(T03[2], T04[2]), 1); T12[2] = _mm_srai_epi16(_mm_add_epi16(T04[2], T05[2]), 1); T13[2] = _mm_srai_epi16(_mm_add_epi16(T05[2], T06[2]), 1); T14[2] = _mm_srai_epi16(_mm_add_epi16(T06[2], T07[2]), 1); T15[2] = _mm_srai_epi16(_mm_add_epi16(T07[2], T07[2]), 1); T08[3] = _mm_srai_epi16(_mm_add_epi16(T00[3], T01[3]), 1); T09[3] = _mm_srai_epi16(_mm_add_epi16(T01[3], T02[3]), 1); T10[3] = _mm_srai_epi16(_mm_add_epi16(T02[3], T03[3]), 1); T11[3] = _mm_srai_epi16(_mm_add_epi16(T03[3], T04[3]), 1); T12[3] = _mm_srai_epi16(_mm_add_epi16(T04[3], T05[3]), 1); T13[3] = _mm_srai_epi16(_mm_add_epi16(T05[3], T06[3]), 1); T14[3] = _mm_srai_epi16(_mm_add_epi16(T06[3], T07[3]), 1); T15[3] = _mm_srai_epi16(_mm_add_epi16(T07[3], T07[3]), 1); /*--transposition--*/ //32x16 -> 16x32 TRANSPOSE_8x8_16BIT(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0]); TRANSPOSE_8x8_16BIT(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_8x8_16BIT(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0]); TRANSPOSE_8x8_16BIT(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_8x8_16BIT(T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1]); TRANSPOSE_8x8_16BIT(T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_8x8_16BIT(T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1]); TRANSPOSE_8x8_16BIT(T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); /*--horizontal transform--*/ //filter (odd pixel/column) V32[0] = _mm_srai_epi16(_mm_add_epi16(V00[0], V01[0]), 1); V33[0] = _mm_srai_epi16(_mm_add_epi16(V01[0], V02[0]), 1); V34[0] = _mm_srai_epi16(_mm_add_epi16(V02[0], V03[0]), 1); V35[0] = _mm_srai_epi16(_mm_add_epi16(V03[0], V04[0]), 1); V36[0] = _mm_srai_epi16(_mm_add_epi16(V04[0], V05[0]), 1); V37[0] = _mm_srai_epi16(_mm_add_epi16(V05[0], V06[0]), 1); V38[0] = _mm_srai_epi16(_mm_add_epi16(V06[0], V07[0]), 1); V39[0] = 
_mm_srai_epi16(_mm_add_epi16(V07[0], V08[0]), 1); V40[0] = _mm_srai_epi16(_mm_add_epi16(V08[0], V09[0]), 1); V41[0] = _mm_srai_epi16(_mm_add_epi16(V09[0], V10[0]), 1); V42[0] = _mm_srai_epi16(_mm_add_epi16(V10[0], V11[0]), 1); V43[0] = _mm_srai_epi16(_mm_add_epi16(V11[0], V12[0]), 1); V44[0] = _mm_srai_epi16(_mm_add_epi16(V12[0], V13[0]), 1); V45[0] = _mm_srai_epi16(_mm_add_epi16(V13[0], V14[0]), 1); V46[0] = _mm_srai_epi16(_mm_add_epi16(V14[0], V15[0]), 1); V47[0] = _mm_srai_epi16(_mm_add_epi16(V15[0], V16[0]), 1); V48[0] = _mm_srai_epi16(_mm_add_epi16(V16[0], V17[0]), 1); V49[0] = _mm_srai_epi16(_mm_add_epi16(V17[0], V18[0]), 1); V50[0] = _mm_srai_epi16(_mm_add_epi16(V18[0], V19[0]), 1); V51[0] = _mm_srai_epi16(_mm_add_epi16(V19[0], V20[0]), 1); V52[0] = _mm_srai_epi16(_mm_add_epi16(V20[0], V21[0]), 1); V53[0] = _mm_srai_epi16(_mm_add_epi16(V21[0], V22[0]), 1); V54[0] = _mm_srai_epi16(_mm_add_epi16(V22[0], V23[0]), 1); V55[0] = _mm_srai_epi16(_mm_add_epi16(V23[0], V24[0]), 1); V56[0] = _mm_srai_epi16(_mm_add_epi16(V24[0], V25[0]), 1); V57[0] = _mm_srai_epi16(_mm_add_epi16(V25[0], V26[0]), 1); V58[0] = _mm_srai_epi16(_mm_add_epi16(V26[0], V27[0]), 1); V59[0] = _mm_srai_epi16(_mm_add_epi16(V27[0], V28[0]), 1); V60[0] = _mm_srai_epi16(_mm_add_epi16(V28[0], V29[0]), 1); V61[0] = _mm_srai_epi16(_mm_add_epi16(V29[0], V30[0]), 1); V62[0] = _mm_srai_epi16(_mm_add_epi16(V30[0], V31[0]), 1); V63[0] = _mm_srai_epi16(_mm_add_epi16(V31[0], V31[0]), 1); V32[1] = _mm_srai_epi16(_mm_add_epi16(V00[1], V01[1]), 1); V33[1] = _mm_srai_epi16(_mm_add_epi16(V01[1], V02[1]), 1); V34[1] = _mm_srai_epi16(_mm_add_epi16(V02[1], V03[1]), 1); V35[1] = _mm_srai_epi16(_mm_add_epi16(V03[1], V04[1]), 1); V36[1] = _mm_srai_epi16(_mm_add_epi16(V04[1], V05[1]), 1); V37[1] = _mm_srai_epi16(_mm_add_epi16(V05[1], V06[1]), 1); V38[1] = _mm_srai_epi16(_mm_add_epi16(V06[1], V07[1]), 1); V39[1] = _mm_srai_epi16(_mm_add_epi16(V07[1], V08[1]), 1); V40[1] = _mm_srai_epi16(_mm_add_epi16(V08[1], V09[1]), 1); V41[1] = _mm_srai_epi16(_mm_add_epi16(V09[1], V10[1]), 1); V42[1] = _mm_srai_epi16(_mm_add_epi16(V10[1], V11[1]), 1); V43[1] = _mm_srai_epi16(_mm_add_epi16(V11[1], V12[1]), 1); V44[1] = _mm_srai_epi16(_mm_add_epi16(V12[1], V13[1]), 1); V45[1] = _mm_srai_epi16(_mm_add_epi16(V13[1], V14[1]), 1); V46[1] = _mm_srai_epi16(_mm_add_epi16(V14[1], V15[1]), 1); V47[1] = _mm_srai_epi16(_mm_add_epi16(V15[1], V16[1]), 1); V48[1] = _mm_srai_epi16(_mm_add_epi16(V16[1], V17[1]), 1); V49[1] = _mm_srai_epi16(_mm_add_epi16(V17[1], V18[1]), 1); V50[1] = _mm_srai_epi16(_mm_add_epi16(V18[1], V19[1]), 1); V51[1] = _mm_srai_epi16(_mm_add_epi16(V19[1], V20[1]), 1); V52[1] = _mm_srai_epi16(_mm_add_epi16(V20[1], V21[1]), 1); V53[1] = _mm_srai_epi16(_mm_add_epi16(V21[1], V22[1]), 1); V54[1] = _mm_srai_epi16(_mm_add_epi16(V22[1], V23[1]), 1); V55[1] = _mm_srai_epi16(_mm_add_epi16(V23[1], V24[1]), 1); V56[1] = _mm_srai_epi16(_mm_add_epi16(V24[1], V25[1]), 1); V57[1] = _mm_srai_epi16(_mm_add_epi16(V25[1], V26[1]), 1); V58[1] = _mm_srai_epi16(_mm_add_epi16(V26[1], V27[1]), 1); V59[1] = _mm_srai_epi16(_mm_add_epi16(V27[1], V28[1]), 1); V60[1] = _mm_srai_epi16(_mm_add_epi16(V28[1], V29[1]), 1); V61[1] = _mm_srai_epi16(_mm_add_epi16(V29[1], V30[1]), 1); V62[1] = _mm_srai_epi16(_mm_add_epi16(V30[1], V31[1]), 1); V63[1] = _mm_srai_epi16(_mm_add_epi16(V31[1], V31[1]), 1); /*--transposition & Store--*/ //16x64 -> 64x16 TRANSPOSE_8x8_16BIT(V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0]); 
TRANSPOSE_8x8_16BIT(V04[0], V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1]); TRANSPOSE_8x8_16BIT(V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2]); TRANSPOSE_8x8_16BIT(V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3]); TRANSPOSE_8x8_16BIT(V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4]); TRANSPOSE_8x8_16BIT(V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5]); TRANSPOSE_8x8_16BIT(V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6]); TRANSPOSE_8x8_16BIT(V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7]); TRANSPOSE_8x8_16BIT(V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_8x8_16BIT(V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_8x8_16BIT(V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_8x8_16BIT(V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_8x8_16BIT(V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]); TRANSPOSE_8x8_16BIT(V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5]); TRANSPOSE_8x8_16BIT(V24[1], V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6]); TRANSPOSE_8x8_16BIT(V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7]); //store for (i = 0; i < 8; i++) { _mm_store_si128((__m128i*)&coeff[8 * i ], T00[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 ], T01[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 2], T02[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 3], T03[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 4], T04[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 5], T05[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 6], T06[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 7], T07[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 8], T08[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 9], T09[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 10], T10[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 11], T11[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 12], T12[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 13], T13[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 14], T14[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 15], T15[i]); } } /* --------------------------------------------------------------------------- */ static void inv_wavelet_16x64_sse128(coeff_t *coeff) { //src coeff 8*32 __m128i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31; __m128i S32, S33, 
S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63; // 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; // 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; int i; /*--load & shift--*/ //8*32 S00 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 0]), 1); S01 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 1]), 1); S02 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 2]), 1); S03 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 3]), 1); S04 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 4]), 1); S05 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 5]), 1); S06 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 6]), 1); S07 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 7]), 1); S08 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 8]), 1); S09 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 9]), 1); S10 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 10]), 1); S11 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 11]), 1); S12 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 12]), 1); S13 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 13]), 1); S14 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 14]), 1); S15 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 15]), 1); S16 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 16]), 1); S17 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 17]), 1); S18 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 18]), 1); S19 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 19]), 1); S20 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 20]), 1); S21 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 21]), 1); S22 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 22]), 1); S23 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 23]), 1); S24 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 24]), 1); S25 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 25]), 1); S26 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 26]), 1); S27 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 27]), 1); S28 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 28]), 1); S29 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 29]), 1); S30 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 30]), 1); S31 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 31]), 1); /*--vertical transform--*/ S32 = _mm_srai_epi16(_mm_add_epi16(S00, S01), 1); S33 = _mm_srai_epi16(_mm_add_epi16(S01, S02), 1); S34 = _mm_srai_epi16(_mm_add_epi16(S02, S03), 1); S35 = _mm_srai_epi16(_mm_add_epi16(S03, S04), 1); S36 = _mm_srai_epi16(_mm_add_epi16(S04, S05), 1); S37 = _mm_srai_epi16(_mm_add_epi16(S05, S06), 1); S38 = _mm_srai_epi16(_mm_add_epi16(S06, S07), 1); S39 = 
_mm_srai_epi16(_mm_add_epi16(S07, S08), 1); S40 = _mm_srai_epi16(_mm_add_epi16(S08, S09), 1); S41 = _mm_srai_epi16(_mm_add_epi16(S09, S10), 1); S42 = _mm_srai_epi16(_mm_add_epi16(S10, S11), 1); S43 = _mm_srai_epi16(_mm_add_epi16(S11, S12), 1); S44 = _mm_srai_epi16(_mm_add_epi16(S12, S13), 1); S45 = _mm_srai_epi16(_mm_add_epi16(S13, S14), 1); S46 = _mm_srai_epi16(_mm_add_epi16(S14, S15), 1); S47 = _mm_srai_epi16(_mm_add_epi16(S15, S16), 1); S48 = _mm_srai_epi16(_mm_add_epi16(S16, S17), 1); S49 = _mm_srai_epi16(_mm_add_epi16(S17, S18), 1); S50 = _mm_srai_epi16(_mm_add_epi16(S18, S19), 1); S51 = _mm_srai_epi16(_mm_add_epi16(S19, S20), 1); S52 = _mm_srai_epi16(_mm_add_epi16(S20, S21), 1); S53 = _mm_srai_epi16(_mm_add_epi16(S21, S22), 1); S54 = _mm_srai_epi16(_mm_add_epi16(S22, S23), 1); S55 = _mm_srai_epi16(_mm_add_epi16(S23, S24), 1); S56 = _mm_srai_epi16(_mm_add_epi16(S24, S25), 1); S57 = _mm_srai_epi16(_mm_add_epi16(S25, S26), 1); S58 = _mm_srai_epi16(_mm_add_epi16(S26, S27), 1); S59 = _mm_srai_epi16(_mm_add_epi16(S27, S28), 1); S60 = _mm_srai_epi16(_mm_add_epi16(S28, S29), 1); S61 = _mm_srai_epi16(_mm_add_epi16(S29, S30), 1); S62 = _mm_srai_epi16(_mm_add_epi16(S30, S31), 1); S63 = _mm_srai_epi16(_mm_add_epi16(S31, S31), 1); /*--transposition--*/ //8x64 -> 64x8 TRANSPOSE_8x8_16BIT(S00, S32, S01, S33, S02, S34, S03, S35, T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0]); TRANSPOSE_8x8_16BIT(S04, S36, S05, S37, S06, S38, S07, S39, T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1]); TRANSPOSE_8x8_16BIT(S08, S40, S09, S41, S10, S42, S11, S43, T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2]); TRANSPOSE_8x8_16BIT(S12, S44, S13, S45, S14, S46, S15, S47, T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3]); TRANSPOSE_8x8_16BIT(S16, S48, S17, S49, S18, S50, S19, S51, T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4]); TRANSPOSE_8x8_16BIT(S20, S52, S21, S53, S22, S54, S23, S55, T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5]); TRANSPOSE_8x8_16BIT(S24, S56, S25, S57, S26, S58, S27, S59, T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6]); TRANSPOSE_8x8_16BIT(S28, S60, S29, S61, S30, S62, S31, S63, T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7]); /*--horizontal transform--*/ for (i = 0; i < 8; i++) { T08[i] = _mm_srai_epi16(_mm_add_epi16(T00[i], T01[i]), 1); T09[i] = _mm_srai_epi16(_mm_add_epi16(T01[i], T02[i]), 1); T10[i] = _mm_srai_epi16(_mm_add_epi16(T02[i], T03[i]), 1); T11[i] = _mm_srai_epi16(_mm_add_epi16(T03[i], T04[i]), 1); T12[i] = _mm_srai_epi16(_mm_add_epi16(T04[i], T05[i]), 1); T13[i] = _mm_srai_epi16(_mm_add_epi16(T05[i], T06[i]), 1); T14[i] = _mm_srai_epi16(_mm_add_epi16(T06[i], T07[i]), 1); T15[i] = _mm_srai_epi16(_mm_add_epi16(T07[i], T07[i]), 1); } /*--transposition--*/ //64x16 -> 16x64 TRANSPOSE_8x8_16BIT(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0]); TRANSPOSE_8x8_16BIT(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_8x8_16BIT(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0]); TRANSPOSE_8x8_16BIT(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_8x8_16BIT(T00[4], T08[4], T01[4], T09[4], T02[4], T10[4], T03[4], T11[4], V32[0], V33[0], V34[0], 
V35[0], V36[0], V37[0], V38[0], V39[0]); TRANSPOSE_8x8_16BIT(T00[5], T08[5], T01[5], T09[5], T02[5], T10[5], T03[5], T11[5], V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0]); TRANSPOSE_8x8_16BIT(T00[6], T08[6], T01[6], T09[6], T02[6], T10[6], T03[6], T11[6], V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0]); TRANSPOSE_8x8_16BIT(T00[7], T08[7], T01[7], T09[7], T02[7], T10[7], T03[7], T11[7], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0]); TRANSPOSE_8x8_16BIT(T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1]); TRANSPOSE_8x8_16BIT(T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_8x8_16BIT(T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1]); TRANSPOSE_8x8_16BIT(T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_8x8_16BIT(T04[4], T12[4], T05[4], T13[4], T06[4], T14[4], T07[4], T15[4], V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1]); TRANSPOSE_8x8_16BIT(T04[5], T12[5], T05[5], T13[5], T06[5], T14[5], T07[5], T15[5], V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1]); TRANSPOSE_8x8_16BIT(T04[6], T12[6], T05[6], T13[6], T06[6], T14[6], T07[6], T15[6], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1]); TRANSPOSE_8x8_16BIT(T04[7], T12[7], T05[7], T13[7], T06[7], T14[7], T07[7], T15[7], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1]); /*--Store--*/ //16x64 for (i = 0; i < 2; i++) { _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 0], V00[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 1], V01[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 2], V02[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 3], V03[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 4], V04[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 5], V05[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 6], V06[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 7], V07[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 8], V08[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 9], V09[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 10], V10[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 11], V11[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 12], V12[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 13], V13[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 14], V14[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 15], V15[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 16], V16[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 17], V17[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 18], V18[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 19], V19[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 20], V20[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 21], V21[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 22], V22[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 23], V23[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 24], V24[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 25], V25[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 26], V26[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 27], V27[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 28], V28[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 29], V29[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 30], V30[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 31], V31[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 32], V32[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 33], V33[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 34], V34[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 35], V35[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 36], V36[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 37], V37[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 38], V38[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 39], V39[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 40], V40[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 41], V41[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 42], V42[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 43], V43[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 44], V44[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 45], V45[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 46], V46[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 47], V47[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 48], V48[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 49], V49[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 50], V50[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 51], V51[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 52], V52[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 53], V53[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 54], V54[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 55], V55[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 56], V56[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 57], V57[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 58], V58[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 59], V59[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 60], V60[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 61], V61[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 62], V62[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 63], V63[i]); } } /* --------------------------------------------------------------------------- */ void idct_c_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_32x32_sse128(src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_c_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_32x8_sse128(src, dst, 32 | 0x01); inv_wavelet_64x16_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_c_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_8x32_sse128(src, dst, 8 | 0x01); inv_wavelet_16x64_sse128(dst); } xavs2-1.3/source/common/vec/intrinsic_idct_avx2.c000066400000000000000000004420011340660520300220760ustar00rootroot00000000000000/* * intrinsic_idct_avx2.c * * Description of this file: * AVX2 assembly functions of IDCT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO Jiaqi ZHANG Tianliang FU * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#include "../basic_types.h"
#include "../avs2_defs.h"
#include "intrinsic.h"
/* disable warnings */ #pragma warning(disable:4127) // warning C4127: conditional expression is constant
ALIGN32(static const coeff_t tab_idct_8x8_256[12][16]) = { { 44, 38, 44, 38, 44, 38, 44, 38, 44, 38, 44, 38, 44, 38, 44, 38 }, { 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9 }, { 38, -9, 38, -9, 38, -9, 38, -9, 38, -9, 38, -9, 38, -9, 38, -9 }, { -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25 }, { 25, -44, 25, -44, 25, -44, 25, -44, 25, -44, 25, -44, 25, -44, 25, -44 }, { 9, 38, 9, 38, 9, 38, 9, 38, 9, 38, 9, 38, 9, 38, 9, 38 }, { 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25 }, { 38, -44, 38, -44, 38, -44, 38, -44, 38, -44, 38, -44, 38, -44, 38, -44 }, { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32, 32, -32 }, { 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17, 42, 17 }, { 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42, 17, -42 } }; void idct_c_8x8_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { const int SHIFT1 = 5; // const int CLIP1 = LIMIT_BIT;
const int SHIFT2 = 20 - g_bit_depth; const int CLIP2 = g_bit_depth + 1; __m256i mAdd; __m256i S1S5, S3S7; __m256i T0, T1, T2, T3; __m256i E0, E1, E2, E3, O0, O1, O2, O3; __m256i EE0, EE1, EO0, EO1; __m256i S0, S1, S2, S3, S4, S5, S6, S7; __m256i C00, C01, C02, C03, C04, C05, C06, C07; __m256i max_val, min_val; UNUSED_PARAMETER(i_dst); S1S5 = _mm256_loadu2_m128i((__m128i*)&src[40], (__m128i*)&src[ 8]); S3S7 = _mm256_loadu2_m128i((__m128i*)&src[56], (__m128i*)&src[24]); T0 = _mm256_unpacklo_epi16(S1S5, S3S7); T1 = _mm256_unpackhi_epi16(S1S5, S3S7); T2 = _mm256_permute2x128_si256(T0, T1, 0x20); T3 = _mm256_permute2x128_si256(T0, T1, 0x31); O0 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[0]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[1])))); O1 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[2]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[3])))); O2 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[4]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[5])))); O3 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[6]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[7])))); /* ------- */
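/* Even-indexed coefficient rows (0, 2, 4, 6) are gathered next to build the EE/EO
 * butterfly terms of the first pass; the odd rows (1, 3, 5, 7) loaded above
 * produced the O0..O3 terms. */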
S1S5 = _mm256_loadu2_m128i((__m128i*)&src[16], (__m128i*)&src[0]); S3S7 = _mm256_loadu2_m128i((__m128i*)&src[48], (__m128i*)&src[32]); T0 = _mm256_unpacklo_epi16(S1S5, S3S7); T1 = _mm256_unpackhi_epi16(S1S5, S3S7); T2 = _mm256_permute2x128_si256(T0, T1, 0x20); T3 = _mm256_permute2x128_si256(T0, T1, 0x31); EE0 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[8]))); EE1 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[9]))); EO0 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[10]))); EO1 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[11]))); /* ------- */ mAdd = _mm256_set1_epi32((1 << (SHIFT1 - 1))); // ״η任 E0 = _mm256_add_epi32(EE0, EO0); E1 = _mm256_add_epi32(EE1, EO1); E3 = _mm256_sub_epi32(EE0, EO0); E2 = _mm256_sub_epi32(EE1, EO1); E0 = _mm256_add_epi32(E0, mAdd); E1 = _mm256_add_epi32(E1, mAdd); E2 = _mm256_add_epi32(E2, mAdd); E3 = _mm256_add_epi32(E3, mAdd); S0 = _mm256_srai_epi32(_mm256_add_epi32(E0, O0), SHIFT1); S7 = _mm256_srai_epi32(_mm256_sub_epi32(E0, O0), SHIFT1); S1 = _mm256_srai_epi32(_mm256_add_epi32(E1, O1), SHIFT1); S6 = _mm256_srai_epi32(_mm256_sub_epi32(E1, O1), SHIFT1); S2 = _mm256_srai_epi32(_mm256_add_epi32(E2, O2), SHIFT1); S5 = _mm256_srai_epi32(_mm256_sub_epi32(E2, O2), SHIFT1); S3 = _mm256_srai_epi32(_mm256_add_epi32(E3, O3), SHIFT1); S4 = _mm256_srai_epi32(_mm256_sub_epi32(E3, O3), SHIFT1); C00 = _mm256_permute2x128_si256(S0, S4, 0x20); C01 = _mm256_permute2x128_si256(S0, S4, 0x31); C02 = _mm256_permute2x128_si256(S1, S5, 0x20); C03 = _mm256_permute2x128_si256(S1, S5, 0x31); C04 = _mm256_permute2x128_si256(S2, S6, 0x20); C05 = _mm256_permute2x128_si256(S2, S6, 0x31); C06 = _mm256_permute2x128_si256(S3, S7, 0x20); C07 = _mm256_permute2x128_si256(S3, S7, 0x31); S0 = _mm256_packs_epi32(C00, C01); S1 = _mm256_packs_epi32(C02, C03); S2 = _mm256_packs_epi32(C04, C05); S3 = _mm256_packs_epi32(C06, C07); S4 = _mm256_unpacklo_epi16(S0, S1); S5 = _mm256_unpacklo_epi16(S2, S3); S6 = _mm256_unpackhi_epi16(S0, S1); S7 = _mm256_unpackhi_epi16(S2, S3); C00 = _mm256_unpacklo_epi32(S4, S5); C01 = _mm256_unpacklo_epi32(S6, S7); C02 = _mm256_unpackhi_epi32(S4, S5); C03 = _mm256_unpackhi_epi32(S6, S7); C04 = _mm256_permute2x128_si256(C00, C02, 0x20); C05 = _mm256_permute2x128_si256(C00, C02, 0x31); C06 = _mm256_permute2x128_si256(C01, C03, 0x20); C07 = _mm256_permute2x128_si256(C01, C03, 0x31); S0 = _mm256_unpacklo_epi64(C04, C05); S1 = _mm256_unpacklo_epi64(C06, C07); S2 = _mm256_unpackhi_epi64(C04, C05); S3 = _mm256_unpackhi_epi64(C06, C07); S4 = _mm256_permute2x128_si256(S2, S3, 0x20); S5 = _mm256_permute2x128_si256(S2, S3, 0x31); T0 = _mm256_unpacklo_epi16(S4, S5); T1 = _mm256_unpackhi_epi16(S4, S5); T2 = _mm256_permute2x128_si256(T0, T1, 0x20); T3 = _mm256_permute2x128_si256(T0, T1, 0x31); O0 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[0]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[1])))); O1 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[2]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[3])))); O2 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[4]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[5])))); O3 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[6]))), _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[7])))); /* ------- */ T0 = 
_mm256_unpacklo_epi16(S0, S1); T1 = _mm256_unpackhi_epi16(S0, S1); T2 = _mm256_permute2x128_si256(T0, T1, 0x20); T3 = _mm256_permute2x128_si256(T0, T1, 0x31); EE0 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[8]))); EE1 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[9]))); EO0 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[10]))); EO1 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[11]))); /* ------- */ mAdd = _mm256_set1_epi32(SHIFT2 ? (1 << (SHIFT2 - 1)) : 0); // E0 = _mm256_add_epi32(EE0, EO0); E1 = _mm256_add_epi32(EE1, EO1); E3 = _mm256_sub_epi32(EE0, EO0); E2 = _mm256_sub_epi32(EE1, EO1); E0 = _mm256_add_epi32(E0, mAdd); E1 = _mm256_add_epi32(E1, mAdd); E2 = _mm256_add_epi32(E2, mAdd); E3 = _mm256_add_epi32(E3, mAdd); S0 = _mm256_srai_epi32(_mm256_add_epi32(E0, O0), SHIFT2); S7 = _mm256_srai_epi32(_mm256_sub_epi32(E0, O0), SHIFT2); S1 = _mm256_srai_epi32(_mm256_add_epi32(E1, O1), SHIFT2); S6 = _mm256_srai_epi32(_mm256_sub_epi32(E1, O1), SHIFT2); S2 = _mm256_srai_epi32(_mm256_add_epi32(E2, O2), SHIFT2); S5 = _mm256_srai_epi32(_mm256_sub_epi32(E2, O2), SHIFT2); S3 = _mm256_srai_epi32(_mm256_add_epi32(E3, O3), SHIFT2); S4 = _mm256_srai_epi32(_mm256_sub_epi32(E3, O3), SHIFT2); C00 = _mm256_permute2x128_si256(S0, S4, 0x20); C01 = _mm256_permute2x128_si256(S0, S4, 0x31); C02 = _mm256_permute2x128_si256(S1, S5, 0x20); C03 = _mm256_permute2x128_si256(S1, S5, 0x31); C04 = _mm256_permute2x128_si256(S2, S6, 0x20); C05 = _mm256_permute2x128_si256(S2, S6, 0x31); C06 = _mm256_permute2x128_si256(S3, S7, 0x20); C07 = _mm256_permute2x128_si256(S3, S7, 0x31); S0 = _mm256_packs_epi32(C00, C01); S1 = _mm256_packs_epi32(C02, C03); S2 = _mm256_packs_epi32(C04, C05); S3 = _mm256_packs_epi32(C06, C07); S4 = _mm256_unpacklo_epi16(S0, S1); S5 = _mm256_unpacklo_epi16(S2, S3); S6 = _mm256_unpackhi_epi16(S0, S1); S7 = _mm256_unpackhi_epi16(S2, S3); C00 = _mm256_unpacklo_epi32(S4, S5); C01 = _mm256_unpacklo_epi32(S6, S7); C02 = _mm256_unpackhi_epi32(S4, S5); C03 = _mm256_unpackhi_epi32(S6, S7); C04 = _mm256_permute2x128_si256(C00, C02, 0x20); C05 = _mm256_permute2x128_si256(C00, C02, 0x31); C06 = _mm256_permute2x128_si256(C01, C03, 0x20); C07 = _mm256_permute2x128_si256(C01, C03, 0x31); S0 = _mm256_unpacklo_epi64(C04, C05); S1 = _mm256_unpacklo_epi64(C06, C07); S2 = _mm256_unpackhi_epi64(C04, C05); S3 = _mm256_unpackhi_epi64(C06, C07); // CLIP2 max_val = _mm256_set1_epi16((1 << (CLIP2 - 1)) - 1); min_val = _mm256_set1_epi16(-(1 << (CLIP2 - 1))); S0 = _mm256_max_epi16(_mm256_min_epi16(S0, max_val), min_val); S1 = _mm256_max_epi16(_mm256_min_epi16(S1, max_val), min_val); S2 = _mm256_max_epi16(_mm256_min_epi16(S2, max_val), min_val); S3 = _mm256_max_epi16(_mm256_min_epi16(S3, max_val), min_val); // store _mm256_storeu2_m128i((__m128i*)&dst[16], (__m128i*)&dst[ 0], S0); _mm256_storeu2_m128i((__m128i*)&dst[48], (__m128i*)&dst[32], S1); _mm256_storeu2_m128i((__m128i*)&dst[24], (__m128i*)&dst[ 8], S2); _mm256_storeu2_m128i((__m128i*)&dst[56], (__m128i*)&dst[40], S3); } void idct_c_16x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift = 20-g_bit_depth; const int clip = g_bit_depth + 1; const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); //row0 87high - 90low address const __m256i c16_p35_p40 = _mm256_set1_epi32(0x00230028); const __m256i c16_p21_p29 = _mm256_set1_epi32(0x0015001D); const __m256i c16_p04_p13 = _mm256_set1_epi32(0x0004000D); const __m256i c16_p29_p43 = _mm256_set1_epi32(0x001D002B); //row1 
const __m256i c16_n21_p04 = _mm256_set1_epi32(0xFFEB0004); const __m256i c16_n45_n40 = _mm256_set1_epi32(0xFFD3FFD8); const __m256i c16_n13_n35 = _mm256_set1_epi32(0xFFF3FFDD); const __m256i c16_p04_p40 = _mm256_set1_epi32(0x00040028); //row2 const __m256i c16_n43_n35 = _mm256_set1_epi32(0xFFD5FFDD); const __m256i c16_p29_n13 = _mm256_set1_epi32(0x001DFFF3); const __m256i c16_p21_p45 = _mm256_set1_epi32(0x0015002D); const __m256i c16_n21_p35 = _mm256_set1_epi32(0xFFEB0023); //row3 const __m256i c16_p04_n43 = _mm256_set1_epi32(0x0004FFD5); const __m256i c16_p13_p45 = _mm256_set1_epi32(0x000D002D); const __m256i c16_n29_n40 = _mm256_set1_epi32(0xFFE3FFD8); const __m256i c16_n40_p29 = _mm256_set1_epi32(0xFFD8001D); //row4 const __m256i c16_p45_n13 = _mm256_set1_epi32(0x002DFFF3); const __m256i c16_n43_n04 = _mm256_set1_epi32(0xFFD5FFFC); const __m256i c16_p35_p21 = _mm256_set1_epi32(0x00230015); const __m256i c16_n45_p21 = _mm256_set1_epi32(0xFFD30015); //row5 const __m256i c16_p13_p29 = _mm256_set1_epi32(0x000D001D); const __m256i c16_p35_n43 = _mm256_set1_epi32(0x0023FFD5); const __m256i c16_n40_p04 = _mm256_set1_epi32(0xFFD80004); const __m256i c16_n35_p13 = _mm256_set1_epi32(0xFFDD000D); //row6 const __m256i c16_n40_p45 = _mm256_set1_epi32(0xFFD8002D); const __m256i c16_p04_p21 = _mm256_set1_epi32(0x00040015); const __m256i c16_p43_n29 = _mm256_set1_epi32(0x002BFFE3); const __m256i c16_n13_p04 = _mm256_set1_epi32(0xFFF30004); //row7 const __m256i c16_n29_p21 = _mm256_set1_epi32(0xFFE30015); const __m256i c16_n40_p35 = _mm256_set1_epi32(0xFFD80023); const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_p38_p44 = _mm256_set1_epi32(0x0026002C); const __m256i c16_p09_p25 = _mm256_set1_epi32(0x00090019); const __m256i c16_n09_p38 = _mm256_set1_epi32(0xFFF70026); const __m256i c16_n25_n44 = _mm256_set1_epi32(0xFFE7FFD4); const __m256i c16_n44_p25 = _mm256_set1_epi32(0xFFD40019); const __m256i c16_p38_p09 = _mm256_set1_epi32(0x00260009); const __m256i c16_n25_p09 = _mm256_set1_epi32(0xFFE70009); const __m256i c16_n44_p38 = _mm256_set1_epi32(0xFFD40026); const __m256i c16_p17_p42 = _mm256_set1_epi32(0x0011002A); const __m256i c16_n42_p17 = _mm256_set1_epi32(0xFFD60011); const __m256i c16_n32_p32 = _mm256_set1_epi32(0xFFE00020); const __m256i c16_p32_p32 = _mm256_set1_epi32(0x00200020); __m256i max_val, min_val; __m256i c32_rnd = _mm256_set1_epi32(16); // һ int nShift = 5; int pass; __m256i in00, in01, in02, in03, in04, in05, in06, in07; __m256i in08, in09, in10, in11, in12, in13, in14, in15; __m256i res00, res01, res02, res03, res04, res05, res06, res07; __m256i res08, res09, res10, res11, res12, res13, res14, res15; UNUSED_PARAMETER(i_dst); in00 = _mm256_lddqu_si256((const __m256i*)&src[0 * 16]); // [07 06 05 04 03 02 01 00] in01 = _mm256_lddqu_si256((const __m256i*)&src[1 * 16]); // [17 16 15 14 13 12 11 10] in02 = _mm256_lddqu_si256((const __m256i*)&src[2 * 16]); // [27 26 25 24 23 22 21 20] in03 = _mm256_lddqu_si256((const __m256i*)&src[3 * 16]); // [37 36 35 34 33 32 31 30] in04 = _mm256_lddqu_si256((const __m256i*)&src[4 * 16]); // [47 46 45 44 43 42 41 40] in05 = _mm256_lddqu_si256((const __m256i*)&src[5 * 16]); // [57 56 55 54 53 52 51 50] in06 = _mm256_lddqu_si256((const __m256i*)&src[6 * 16]); // [67 66 65 64 63 62 61 60] in07 = _mm256_lddqu_si256((const __m256i*)&src[7 * 16]); // [77 76 75 74 73 72 71 70] in08 = _mm256_lddqu_si256((const __m256i*)&src[8 * 16]); in09 = _mm256_lddqu_si256((const __m256i*)&src[9 * 16]); in10 = _mm256_lddqu_si256((const 
__m256i*)&src[10 * 16]); in11 = _mm256_lddqu_si256((const __m256i*)&src[11 * 16]); in12 = _mm256_lddqu_si256((const __m256i*)&src[12 * 16]); in13 = _mm256_lddqu_si256((const __m256i*)&src[13 * 16]); in14 = _mm256_lddqu_si256((const __m256i*)&src[14 * 16]); in15 = _mm256_lddqu_si256((const __m256i*)&src[15 * 16]); for (pass = 0; pass < 2; pass++) { const __m256i T_00_00A = _mm256_unpacklo_epi16(in01, in03); // [33 13 32 12 31 11 30 10] const __m256i T_00_00B = _mm256_unpackhi_epi16(in01, in03); // [37 17 36 16 35 15 34 14] const __m256i T_00_01A = _mm256_unpacklo_epi16(in05, in07); // [ ] const __m256i T_00_01B = _mm256_unpackhi_epi16(in05, in07); // [ ] const __m256i T_00_02A = _mm256_unpacklo_epi16(in09, in11); // [ ] const __m256i T_00_02B = _mm256_unpackhi_epi16(in09, in11); // [ ] const __m256i T_00_03A = _mm256_unpacklo_epi16(in13, in15); // [ ] const __m256i T_00_03B = _mm256_unpackhi_epi16(in13, in15); // [ ] const __m256i T_00_04A = _mm256_unpacklo_epi16(in02, in06); // [ ] const __m256i T_00_04B = _mm256_unpackhi_epi16(in02, in06); // [ ] const __m256i T_00_05A = _mm256_unpacklo_epi16(in10, in14); // [ ] const __m256i T_00_05B = _mm256_unpackhi_epi16(in10, in14); // [ ] const __m256i T_00_06A = _mm256_unpacklo_epi16(in04, in12); // [ ]row const __m256i T_00_06B = _mm256_unpackhi_epi16(in04, in12); // [ ] const __m256i T_00_07A = _mm256_unpacklo_epi16(in00, in08); // [83 03 82 02 81 01 81 00] row08 row00 const __m256i T_00_07B = _mm256_unpackhi_epi16(in00, in08); // [87 07 86 06 85 05 84 04] __m256i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m256i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m256i EO0A, EO1A, EO2A, EO3A; __m256i EO0B, EO1B, EO2B, EO3B; __m256i EEO0A, EEO1A; __m256i EEO0B, EEO1B; __m256i EEE0A, EEE1A; __m256i EEE0B, EEE1B; { __m256i T00, T01; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(row0103, c0103), _mm256_madd_epi16(row0507, c0507)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(row0911, c0911), _mm256_madd_epi16(row1315, c1315)); \ row = _mm256_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4B) COMPUTE_ROW(T_00_00B, 
T_00_01B, T_00_02B, T_00_03B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7B) #undef COMPUTE_ROW } EO0A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_p38_p44), _mm256_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO0B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_p38_p44), _mm256_madd_epi16(T_00_05B, c16_p09_p25)); EO1A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n09_p38), _mm256_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO1B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n09_p38), _mm256_madd_epi16(T_00_05B, c16_n25_n44)); EO2A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n44_p25), _mm256_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO2B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n44_p25), _mm256_madd_epi16(T_00_05B, c16_p38_p09)); EO3A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n25_p09), _mm256_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EO3B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n25_p09), _mm256_madd_epi16(T_00_05B, c16_n44_p38)); EEO0A = _mm256_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm256_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm256_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm256_madd_epi16(T_00_06B, c16_n42_p17); EEE0A = _mm256_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm256_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm256_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm256_madd_epi16(T_00_07B, c16_n32_p32); { const __m256i EE0A = _mm256_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m256i EE0B = _mm256_add_epi32(EEE0B, EEO0B); const __m256i EE1A = _mm256_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m256i EE1B = _mm256_add_epi32(EEE1B, EEO1B); const __m256i EE3A = _mm256_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m256i EE3B = _mm256_sub_epi32(EEE0B, EEO0B); const __m256i EE2A = _mm256_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m256i EE2B = _mm256_sub_epi32(EEE1B, EEO1B); const __m256i E0A = _mm256_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m256i E0B = _mm256_add_epi32(EE0B, EO0B); const __m256i E1A = _mm256_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m256i E1B = _mm256_add_epi32(EE1B, EO1B); const __m256i E2A = _mm256_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m256i E2B = _mm256_add_epi32(EE2B, EO2B); const __m256i E3A = _mm256_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m256i E3B = _mm256_add_epi32(EE3B, EO3B); const __m256i E7A = _mm256_sub_epi32(EE0A, EO0A); // E7 = EE0 - EO0 const __m256i E7B = _mm256_sub_epi32(EE0B, EO0B); const __m256i E6A = _mm256_sub_epi32(EE1A, EO1A); // E6 = EE1 - EO1 const __m256i E6B = _mm256_sub_epi32(EE1B, EO1B); const __m256i E5A = _mm256_sub_epi32(EE2A, EO2A); // E5 = EE2 - EO2 const __m256i E5B = _mm256_sub_epi32(EE2B, EO2B); const __m256i E4A = _mm256_sub_epi32(EE3A, EO3A); // E4 = EE3 - EO3 const __m256i E4B = _mm256_sub_epi32(EE3B, EO3B); const __m256i T10A = _mm256_add_epi32(E0A, c32_rnd); // E0 + rnd const __m256i T10B = _mm256_add_epi32(E0B, c32_rnd); const __m256i T11A = _mm256_add_epi32(E1A, c32_rnd); // E1 + rnd const __m256i T11B = _mm256_add_epi32(E1B, c32_rnd); const __m256i T12A = _mm256_add_epi32(E2A, c32_rnd); // E2 + rnd const __m256i T12B = _mm256_add_epi32(E2B, c32_rnd); const __m256i T13A = _mm256_add_epi32(E3A, c32_rnd); // E3 + rnd const __m256i T13B = _mm256_add_epi32(E3B, c32_rnd); 
const __m256i T14A = _mm256_add_epi32(E4A, c32_rnd); // E4 + rnd const __m256i T14B = _mm256_add_epi32(E4B, c32_rnd); const __m256i T15A = _mm256_add_epi32(E5A, c32_rnd); // E5 + rnd const __m256i T15B = _mm256_add_epi32(E5B, c32_rnd); const __m256i T16A = _mm256_add_epi32(E6A, c32_rnd); // E6 + rnd const __m256i T16B = _mm256_add_epi32(E6B, c32_rnd); const __m256i T17A = _mm256_add_epi32(E7A, c32_rnd); // E7 + rnd const __m256i T17B = _mm256_add_epi32(E7B, c32_rnd); const __m256i T20A = _mm256_add_epi32(T10A, O0A); // E0 + O0 + rnd const __m256i T20B = _mm256_add_epi32(T10B, O0B); const __m256i T21A = _mm256_add_epi32(T11A, O1A); // E1 + O1 + rnd const __m256i T21B = _mm256_add_epi32(T11B, O1B); const __m256i T22A = _mm256_add_epi32(T12A, O2A); // E2 + O2 + rnd const __m256i T22B = _mm256_add_epi32(T12B, O2B); const __m256i T23A = _mm256_add_epi32(T13A, O3A); // E3 + O3 + rnd const __m256i T23B = _mm256_add_epi32(T13B, O3B); const __m256i T24A = _mm256_add_epi32(T14A, O4A); // E4 const __m256i T24B = _mm256_add_epi32(T14B, O4B); const __m256i T25A = _mm256_add_epi32(T15A, O5A); // E5 const __m256i T25B = _mm256_add_epi32(T15B, O5B); const __m256i T26A = _mm256_add_epi32(T16A, O6A); // E6 const __m256i T26B = _mm256_add_epi32(T16B, O6B); const __m256i T27A = _mm256_add_epi32(T17A, O7A); // E7 const __m256i T27B = _mm256_add_epi32(T17B, O7B); const __m256i T2FA = _mm256_sub_epi32(T10A, O0A); // E0 - O0 + rnd const __m256i T2FB = _mm256_sub_epi32(T10B, O0B); const __m256i T2EA = _mm256_sub_epi32(T11A, O1A); // E1 - O1 + rnd const __m256i T2EB = _mm256_sub_epi32(T11B, O1B); const __m256i T2DA = _mm256_sub_epi32(T12A, O2A); // E2 - O2 + rnd const __m256i T2DB = _mm256_sub_epi32(T12B, O2B); const __m256i T2CA = _mm256_sub_epi32(T13A, O3A); // E3 - O3 + rnd const __m256i T2CB = _mm256_sub_epi32(T13B, O3B); const __m256i T2BA = _mm256_sub_epi32(T14A, O4A); // E4 const __m256i T2BB = _mm256_sub_epi32(T14B, O4B); const __m256i T2AA = _mm256_sub_epi32(T15A, O5A); // E5 const __m256i T2AB = _mm256_sub_epi32(T15B, O5B); const __m256i T29A = _mm256_sub_epi32(T16A, O6A); // E6 const __m256i T29B = _mm256_sub_epi32(T16B, O6B); const __m256i T28A = _mm256_sub_epi32(T17A, O7A); // E7 const __m256i T28B = _mm256_sub_epi32(T17B, O7B); const __m256i T30A = _mm256_srai_epi32(T20A, nShift); // [30 20 10 00] // This operation make it much slower than 128 const __m256i T30B = _mm256_srai_epi32(T20B, nShift); // [70 60 50 40] // This operation make it much slower than 128 const __m256i T31A = _mm256_srai_epi32(T21A, nShift); // [31 21 11 01] // This operation make it much slower than 128 const __m256i T31B = _mm256_srai_epi32(T21B, nShift); // [71 61 51 41] // This operation make it much slower than 128 const __m256i T32A = _mm256_srai_epi32(T22A, nShift); // [32 22 12 02] // This operation make it much slower than 128 const __m256i T32B = _mm256_srai_epi32(T22B, nShift); // [72 62 52 42] // This operation make it much slower than 128 const __m256i T33A = _mm256_srai_epi32(T23A, nShift); // [33 23 13 03] // This operation make it much slower than 128 const __m256i T33B = _mm256_srai_epi32(T23B, nShift); // [73 63 53 43] // This operation make it much slower than 128 const __m256i T34A = _mm256_srai_epi32(T24A, nShift); // [33 24 14 04] // This operation make it much slower than 128 const __m256i T34B = _mm256_srai_epi32(T24B, nShift); // [74 64 54 44] // This operation make it much slower than 128 const __m256i T35A = _mm256_srai_epi32(T25A, nShift); // [35 25 15 05] // This operation make it much slower than 128 
const __m256i T35B = _mm256_srai_epi32(T25B, nShift); // [75 65 55 45] // This operation make it much slower than 128 const __m256i T36A = _mm256_srai_epi32(T26A, nShift); // [36 26 16 06] // This operation make it much slower than 128 const __m256i T36B = _mm256_srai_epi32(T26B, nShift); // [76 66 56 46] // This operation make it much slower than 128 const __m256i T37A = _mm256_srai_epi32(T27A, nShift); // [37 27 17 07] // This operation make it much slower than 128 const __m256i T37B = _mm256_srai_epi32(T27B, nShift); // [77 67 57 47] // This operation make it much slower than 128 const __m256i T38A = _mm256_srai_epi32(T28A, nShift); // [30 20 10 00] x8 // This operation make it much slower than 128 const __m256i T38B = _mm256_srai_epi32(T28B, nShift); // [70 60 50 40] const __m256i T39A = _mm256_srai_epi32(T29A, nShift); // [31 21 11 01] x9 // This operation make it much slower than 128 const __m256i T39B = _mm256_srai_epi32(T29B, nShift); // [71 61 51 41] const __m256i T3AA = _mm256_srai_epi32(T2AA, nShift); // [32 22 12 02] xA // This operation make it much slower than 128 const __m256i T3AB = _mm256_srai_epi32(T2AB, nShift); // [72 62 52 42] const __m256i T3BA = _mm256_srai_epi32(T2BA, nShift); // [33 23 13 03] xB // This operation make it much slower than 128 const __m256i T3BB = _mm256_srai_epi32(T2BB, nShift); // [73 63 53 43] const __m256i T3CA = _mm256_srai_epi32(T2CA, nShift); // [33 24 14 04] xC // This operation make it much slower than 128 const __m256i T3CB = _mm256_srai_epi32(T2CB, nShift); // [74 64 54 44] const __m256i T3DA = _mm256_srai_epi32(T2DA, nShift); // [35 25 15 05] xD // This operation make it much slower than 128 const __m256i T3DB = _mm256_srai_epi32(T2DB, nShift); // [75 65 55 45] const __m256i T3EA = _mm256_srai_epi32(T2EA, nShift); // [36 26 16 06] xE // This operation make it much slower than 128 const __m256i T3EB = _mm256_srai_epi32(T2EB, nShift); // [76 66 56 46] const __m256i T3FA = _mm256_srai_epi32(T2FA, nShift); // [37 27 17 07] xF // This operation make it much slower than 128 const __m256i T3FB = _mm256_srai_epi32(T2FB, nShift); // [77 67 57 47] res00 = _mm256_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01 = _mm256_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02 = _mm256_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03 = _mm256_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04 = _mm256_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] res05 = _mm256_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06 = _mm256_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07 = _mm256_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08 = _mm256_packs_epi32(T38A, T38B); // [A0 ... 80] res09 = _mm256_packs_epi32(T39A, T39B); // [A1 ... 81] res10 = _mm256_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11 = _mm256_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12 = _mm256_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13 = _mm256_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14 = _mm256_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15 = _mm256_packs_epi32(T3FA, T3FB); // [A7 ... 87] } //transpose matrix 16x16 16bit. 
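/* Transposing the 16x16 block in registers between the two iterations of the pass
 * loop lets the same row-oriented butterfly code above also transform the columns. */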
{ __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7, tr0_8, tr0_9, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; #define TRANSPOSE_16x16_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \ tr0_0 = _mm256_unpacklo_epi16(I0, I1); \ tr0_1 = _mm256_unpacklo_epi16(I2, I3); \ tr0_2 = _mm256_unpacklo_epi16(I4, I5); \ tr0_3 = _mm256_unpacklo_epi16(I6, I7); \ tr0_4 = _mm256_unpacklo_epi16(I8, I9); \ tr0_5 = _mm256_unpacklo_epi16(I10, I11); \ tr0_6 = _mm256_unpacklo_epi16(I12, I13); \ tr0_7 = _mm256_unpacklo_epi16(I14, I15); \ tr0_8 = _mm256_unpackhi_epi16(I0, I1); \ tr0_9 = _mm256_unpackhi_epi16(I2, I3); \ tr0_10 = _mm256_unpackhi_epi16(I4, I5); \ tr0_11 = _mm256_unpackhi_epi16(I6, I7); \ tr0_12 = _mm256_unpackhi_epi16(I8, I9); \ tr0_13 = _mm256_unpackhi_epi16(I10, I11); \ tr0_14 = _mm256_unpackhi_epi16(I12, I13); \ tr0_15 = _mm256_unpackhi_epi16(I14, I15); \ O0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \ O1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \ O2 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \ O3 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \ O4 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \ O5 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \ O6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \ O7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \ O8 = _mm256_unpacklo_epi32(tr0_8, tr0_9); \ O9 = _mm256_unpacklo_epi32(tr0_10, tr0_11); \ O10 = _mm256_unpacklo_epi32(tr0_12, tr0_13); \ O11 = _mm256_unpacklo_epi32(tr0_14, tr0_15); \ O12 = _mm256_unpackhi_epi32(tr0_8, tr0_9); \ O13 = _mm256_unpackhi_epi32(tr0_10, tr0_11); \ O14 = _mm256_unpackhi_epi32(tr0_12, tr0_13); \ O15 = _mm256_unpackhi_epi32(tr0_14, tr0_15); \ tr0_0 = _mm256_unpacklo_epi64(O0, O1); \ tr0_1 = _mm256_unpacklo_epi64(O2, O3); \ tr0_2 = _mm256_unpackhi_epi64(O0, O1); \ tr0_3 = _mm256_unpackhi_epi64(O2, O3); \ tr0_4 = _mm256_unpacklo_epi64(O4, O5); \ tr0_5 = _mm256_unpacklo_epi64(O6, O7); \ tr0_6 = _mm256_unpackhi_epi64(O4, O5); \ tr0_7 = _mm256_unpackhi_epi64(O6, O7); \ tr0_8 = _mm256_unpacklo_epi64(O8, O9); \ tr0_9 = _mm256_unpacklo_epi64(O10, O11); \ tr0_10 = _mm256_unpackhi_epi64(O8, O9); \ tr0_11 = _mm256_unpackhi_epi64(O10, O11); \ tr0_12 = _mm256_unpacklo_epi64(O12, O13); \ tr0_13 = _mm256_unpacklo_epi64(O14, O15); \ tr0_14 = _mm256_unpackhi_epi64(O12, O13); \ tr0_15 = _mm256_unpackhi_epi64(O14, O15); \ O0 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x20); \ O1 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x20); \ O2 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x20); \ O3 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x20); \ O4 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x20); \ O5 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x20); \ O6 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x20); \ O7 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x20); \ O8 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x31); \ O9 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x31); \ O10 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x31); \ O11 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x31); \ O12 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x31); \ O13 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x31); \ O14 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x31); \ O15 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x31); \ TRANSPOSE_16x16_16BIT(res00, res01, res02, res03, res04, res05, res06, res07, res08, res09, res10, res11, res12, res13, res14, res15, in00, in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15) #undef TRANSPOSE_16x16_16BIT } nShift = shift; c32_rnd = 
_mm256_set1_epi32(shift ? (1 << (shift - 1)) : 0); // pass == 1 (second pass) } // clip max_val = _mm256_set1_epi16((1 << (clip - 1)) - 1); min_val = _mm256_set1_epi16(-(1 << (clip - 1))); in00 = _mm256_max_epi16(_mm256_min_epi16(in00, max_val), min_val); in01 = _mm256_max_epi16(_mm256_min_epi16(in01, max_val), min_val); in02 = _mm256_max_epi16(_mm256_min_epi16(in02, max_val), min_val); in03 = _mm256_max_epi16(_mm256_min_epi16(in03, max_val), min_val); in04 = _mm256_max_epi16(_mm256_min_epi16(in04, max_val), min_val); in05 = _mm256_max_epi16(_mm256_min_epi16(in05, max_val), min_val); in06 = _mm256_max_epi16(_mm256_min_epi16(in06, max_val), min_val); in07 = _mm256_max_epi16(_mm256_min_epi16(in07, max_val), min_val); in08 = _mm256_max_epi16(_mm256_min_epi16(in08, max_val), min_val); in09 = _mm256_max_epi16(_mm256_min_epi16(in09, max_val), min_val); in10 = _mm256_max_epi16(_mm256_min_epi16(in10, max_val), min_val); in11 = _mm256_max_epi16(_mm256_min_epi16(in11, max_val), min_val); in12 = _mm256_max_epi16(_mm256_min_epi16(in12, max_val), min_val); in13 = _mm256_max_epi16(_mm256_min_epi16(in13, max_val), min_val); in14 = _mm256_max_epi16(_mm256_min_epi16(in14, max_val), min_val); in15 = _mm256_max_epi16(_mm256_min_epi16(in15, max_val), min_val); // store _mm256_storeu_si256((__m256i*)&dst[0 * 16 + 0], in00); _mm256_storeu_si256((__m256i*)&dst[1 * 16 + 0], in01); _mm256_storeu_si256((__m256i*)&dst[2 * 16 + 0], in02); _mm256_storeu_si256((__m256i*)&dst[3 * 16 + 0], in03); _mm256_storeu_si256((__m256i*)&dst[4 * 16 + 0], in04); _mm256_storeu_si256((__m256i*)&dst[5 * 16 + 0], in05); _mm256_storeu_si256((__m256i*)&dst[6 * 16 + 0], in06); _mm256_storeu_si256((__m256i*)&dst[7 * 16 + 0], in07); _mm256_storeu_si256((__m256i*)&dst[8 * 16 + 0], in08); _mm256_storeu_si256((__m256i*)&dst[9 * 16 + 0], in09); _mm256_storeu_si256((__m256i*)&dst[10 * 16 + 0], in10); _mm256_storeu_si256((__m256i*)&dst[11 * 16 + 0], in11); _mm256_storeu_si256((__m256i*)&dst[12 * 16 + 0], in12); _mm256_storeu_si256((__m256i*)&dst[13 * 16 + 0], in13); _mm256_storeu_si256((__m256i*)&dst[14 * 16 + 0], in14); _mm256_storeu_si256((__m256i*)&dst[15 * 16 + 0], in15); } void idct_c_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { int shift = 20 - g_bit_depth - (i_dst & 0x01); int clip = g_bit_depth + 1 + (i_dst & 0x01); int k, i; __m256i max_val, min_val; __m256i EEO0A, EEO1A, EEO2A, EEO3A, EEO0B, EEO1B, EEO2B, EEO3B; __m256i EEEO0A, EEEO0B, EEEO1A, EEEO1B; __m256i EEEE0A, EEEE0B, EEEE1A, EEEE1B; __m256i EEE0A, EEE0B, EEE1A, EEE1B, EEE3A, EEE3B, EEE2A, EEE2B; __m256i EE0A, EE0B, EE1A, EE1B, EE2A, EE2B, EE3A, EE3B, EE7A, EE7B, EE6A, EE6B, EE5A, EE5B, EE4A, EE4B; __m256i E0A, E0B, E1A, E1B, E2A, E2B, E3A, E3B, E4A, E4B, E5A, E5B, E6A, E6B, E7A, E7B, EFA, EFB, EEA, EEB, EDA, EDB, ECA, ECB, EBA, EBB, EAA, EAB, E9A, E9B, E8A, E8B; __m256i T10A, T10B, T11A, T11B, T12A, T12B, T13A, T13B, T14A, T14B, T15A, T15B, T16A, T16B, T17A, T17B, T18A, T18B, T19A, T19B, T1AA, T1AB, T1BA, T1BB, T1CA, T1CB, T1DA, T1DB, T1EA, T1EB, T1FA, T1FB; __m256i T2_00A, T2_00B, T2_01A, T2_01B, T2_02A, T2_02B, T2_03A, T2_03B, T2_04A, T2_04B, T2_05A, T2_05B, T2_06A, T2_06B, T2_07A, T2_07B, T2_08A, T2_08B, T2_09A, T2_09B, T2_10A, T2_10B, T2_11A, T2_11B, T2_12A, T2_12B, T2_13A, T2_13B, T2_14A, T2_14B, T2_15A, T2_15B, T2_31A, T2_31B, T2_30A, T2_30B, T2_29A, T2_29B, T2_28A, T2_28B, T2_27A, T2_27B, T2_26A, T2_26B, T2_25A, T2_25B, T2_24A, T2_24B, T2_23A, T2_23B, T2_22A, T2_22B, T2_21A, T2_21B, T2_20A, T2_20B, T2_19A, T2_19B, T2_18A, T2_18B, T2_17A, T2_17B, T2_16A, T2_16B; __m256i
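/* T3_xx below receive the rounded-and-shifted butterfly outputs of each pass,
 * just before _mm256_packs_epi32 narrows them back to 16-bit coefficients. */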
T3_00A, T3_00B, T3_01A, T3_01B, T3_02A, T3_02B, T3_03A, T3_03B, T3_04A, T3_04B, T3_05A, T3_05B, T3_06A, T3_06B, T3_07A, T3_07B, T3_08A, T3_08B, T3_09A, T3_09B, T3_10A, T3_10B, T3_11A, T3_11B, T3_12A, T3_12B, T3_13A, T3_13B, T3_14A, T3_14B, T3_15A, T3_15B; __m256i T3_16A, T3_16B, T3_17A, T3_17B, T3_18A, T3_18B, T3_19A, T3_19B, T3_20A, T3_20B, T3_21A, T3_21B, T3_22A, T3_22B, T3_23A, T3_23B, T3_24A, T3_24B, T3_25A, T3_25B, T3_26A, T3_26B, T3_27A, T3_27B, T3_28A, T3_28B, T3_29A, T3_29B, T3_30A, T3_30B, T3_31A, T3_31B; const __m256i c16_p45_p45 = _mm256_set1_epi32(0x002D002D); const __m256i c16_p43_p44 = _mm256_set1_epi32(0x002B002C); const __m256i c16_p39_p41 = _mm256_set1_epi32(0x00270029); const __m256i c16_p34_p36 = _mm256_set1_epi32(0x00220024); const __m256i c16_p27_p30 = _mm256_set1_epi32(0x001B001E); const __m256i c16_p19_p23 = _mm256_set1_epi32(0x00130017); const __m256i c16_p11_p15 = _mm256_set1_epi32(0x000B000F); const __m256i c16_p02_p07 = _mm256_set1_epi32(0x00020007); const __m256i c16_p41_p45 = _mm256_set1_epi32(0x0029002D); const __m256i c16_p23_p34 = _mm256_set1_epi32(0x00170022); const __m256i c16_n02_p11 = _mm256_set1_epi32(0xFFFE000B); const __m256i c16_n27_n15 = _mm256_set1_epi32(0xFFE5FFF1); const __m256i c16_n43_n36 = _mm256_set1_epi32(0xFFD5FFDC); const __m256i c16_n44_n45 = _mm256_set1_epi32(0xFFD4FFD3); const __m256i c16_n30_n39 = _mm256_set1_epi32(0xFFE2FFD9); const __m256i c16_n07_n19 = _mm256_set1_epi32(0xFFF9FFED); const __m256i c16_p34_p44 = _mm256_set1_epi32(0x0022002C); const __m256i c16_n07_p15 = _mm256_set1_epi32(0xFFF9000F); const __m256i c16_n41_n27 = _mm256_set1_epi32(0xFFD7FFE5); const __m256i c16_n39_n45 = _mm256_set1_epi32(0xFFD9FFD3); const __m256i c16_n02_n23 = _mm256_set1_epi32(0xFFFEFFE9); const __m256i c16_p36_p19 = _mm256_set1_epi32(0x00240013); const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); const __m256i c16_p11_p30 = _mm256_set1_epi32(0x000B001E); const __m256i c16_p23_p43 = _mm256_set1_epi32(0x0017002B); const __m256i c16_n34_n07 = _mm256_set1_epi32(0xFFDEFFF9); const __m256i c16_n36_n45 = _mm256_set1_epi32(0xFFDCFFD3); const __m256i c16_p19_n11 = _mm256_set1_epi32(0x0013FFF5); const __m256i c16_p44_p41 = _mm256_set1_epi32(0x002C0029); const __m256i c16_n02_p27 = _mm256_set1_epi32(0xFFFE001B); const __m256i c16_n45_n30 = _mm256_set1_epi32(0xFFD3FFE2); const __m256i c16_n15_n39 = _mm256_set1_epi32(0xFFF1FFD9); const __m256i c16_p11_p41 = _mm256_set1_epi32(0x000B0029); const __m256i c16_n45_n27 = _mm256_set1_epi32(0xFFD3FFE5); const __m256i c16_p07_n30 = _mm256_set1_epi32(0x0007FFE2); const __m256i c16_p43_p39 = _mm256_set1_epi32(0x002B0027); const __m256i c16_n23_p15 = _mm256_set1_epi32(0xFFE9000F); const __m256i c16_n34_n45 = _mm256_set1_epi32(0xFFDEFFD3); const __m256i c16_p36_p02 = _mm256_set1_epi32(0x00240002); const __m256i c16_p19_p44 = _mm256_set1_epi32(0x0013002C); const __m256i c16_n02_p39 = _mm256_set1_epi32(0xFFFE0027); const __m256i c16_n36_n41 = _mm256_set1_epi32(0xFFDCFFD7); const __m256i c16_p43_p07 = _mm256_set1_epi32(0x002B0007); const __m256i c16_n11_p34 = _mm256_set1_epi32(0xFFF50022); const __m256i c16_n30_n44 = _mm256_set1_epi32(0xFFE2FFD4); const __m256i c16_p45_p15 = _mm256_set1_epi32(0x002D000F); const __m256i c16_n19_p27 = _mm256_set1_epi32(0xFFED001B); const __m256i c16_n23_n45 = _mm256_set1_epi32(0xFFE9FFD3); const __m256i c16_n15_p36 = _mm256_set1_epi32(0xFFF10024); const __m256i c16_n11_n45 = _mm256_set1_epi32(0xFFF5FFD3); const __m256i c16_p34_p39 = _mm256_set1_epi32(0x00220027); const __m256i c16_n45_n19 
= _mm256_set1_epi32(0xFFD3FFED); const __m256i c16_p41_n07 = _mm256_set1_epi32(0x0029FFF9); const __m256i c16_n23_p30 = _mm256_set1_epi32(0xFFE9001E); const __m256i c16_n02_n44 = _mm256_set1_epi32(0xFFFEFFD4); const __m256i c16_p27_p43 = _mm256_set1_epi32(0x001B002B); const __m256i c16_n27_p34 = _mm256_set1_epi32(0xFFE50022); const __m256i c16_p19_n39 = _mm256_set1_epi32(0x0013FFD9); const __m256i c16_n11_p43 = _mm256_set1_epi32(0xFFF5002B); const __m256i c16_p02_n45 = _mm256_set1_epi32(0x0002FFD3); const __m256i c16_p07_p45 = _mm256_set1_epi32(0x0007002D); const __m256i c16_n15_n44 = _mm256_set1_epi32(0xFFF1FFD4); const __m256i c16_p23_p41 = _mm256_set1_epi32(0x00170029); const __m256i c16_n30_n36 = _mm256_set1_epi32(0xFFE2FFDC); const __m256i c16_n36_p30 = _mm256_set1_epi32(0xFFDC001E); const __m256i c16_p41_n23 = _mm256_set1_epi32(0x0029FFE9); const __m256i c16_n44_p15 = _mm256_set1_epi32(0xFFD4000F); const __m256i c16_p45_n07 = _mm256_set1_epi32(0x002DFFF9); const __m256i c16_n45_n02 = _mm256_set1_epi32(0xFFD3FFFE); const __m256i c16_p43_p11 = _mm256_set1_epi32(0x002B000B); const __m256i c16_n39_n19 = _mm256_set1_epi32(0xFFD9FFED); const __m256i c16_p34_p27 = _mm256_set1_epi32(0x0022001B); const __m256i c16_n43_p27 = _mm256_set1_epi32(0xFFD5001B); const __m256i c16_p44_n02 = _mm256_set1_epi32(0x002CFFFE); const __m256i c16_n30_n23 = _mm256_set1_epi32(0xFFE2FFE9); const __m256i c16_p07_p41 = _mm256_set1_epi32(0x00070029); const __m256i c16_p19_n45 = _mm256_set1_epi32(0x0013FFD3); const __m256i c16_n39_p34 = _mm256_set1_epi32(0xFFD90022); const __m256i c16_p45_n11 = _mm256_set1_epi32(0x002DFFF5); const __m256i c16_n36_n15 = _mm256_set1_epi32(0xFFDCFFF1); const __m256i c16_n45_p23 = _mm256_set1_epi32(0xFFD30017); const __m256i c16_p27_p19 = _mm256_set1_epi32(0x001B0013); const __m256i c16_p15_n45 = _mm256_set1_epi32(0x000FFFD3); const __m256i c16_n44_p30 = _mm256_set1_epi32(0xFFD4001E); const __m256i c16_p34_p11 = _mm256_set1_epi32(0x0022000B); const __m256i c16_p07_n43 = _mm256_set1_epi32(0x0007FFD5); const __m256i c16_n41_p36 = _mm256_set1_epi32(0xFFD70024); const __m256i c16_p39_p02 = _mm256_set1_epi32(0x00270002); const __m256i c16_n44_p19 = _mm256_set1_epi32(0xFFD40013); const __m256i c16_n02_p36 = _mm256_set1_epi32(0xFFFE0024); const __m256i c16_p45_n34 = _mm256_set1_epi32(0x002DFFDE); const __m256i c16_n15_n23 = _mm256_set1_epi32(0xFFF1FFE9); const __m256i c16_n39_p43 = _mm256_set1_epi32(0xFFD9002B); const __m256i c16_p30_p07 = _mm256_set1_epi32(0x001E0007); const __m256i c16_p27_n45 = _mm256_set1_epi32(0x001BFFD3); const __m256i c16_n41_p11 = _mm256_set1_epi32(0xFFD7000B); const __m256i c16_n39_p15 = _mm256_set1_epi32(0xFFD9000F); const __m256i c16_n30_p45 = _mm256_set1_epi32(0xFFE2002D); const __m256i c16_p27_p02 = _mm256_set1_epi32(0x001B0002); const __m256i c16_p41_n44 = _mm256_set1_epi32(0x0029FFD4); const __m256i c16_n11_n19 = _mm256_set1_epi32(0xFFF5FFED); const __m256i c16_n45_p36 = _mm256_set1_epi32(0xFFD30024); const __m256i c16_n07_p34 = _mm256_set1_epi32(0xFFF90022); const __m256i c16_p43_n23 = _mm256_set1_epi32(0x002BFFE9); const __m256i c16_n30_p11 = _mm256_set1_epi32(0xFFE2000B); const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_n19_p36 = _mm256_set1_epi32(0xFFED0024); const __m256i c16_p23_n02 = _mm256_set1_epi32(0x0017FFFE); const __m256i c16_p45_n39 = _mm256_set1_epi32(0x002DFFD9); const __m256i c16_p27_n41 = _mm256_set1_epi32(0x001BFFD7); const __m256i c16_n15_n07 = _mm256_set1_epi32(0xFFF1FFF9); const __m256i c16_n44_p34 = 
_mm256_set1_epi32(0xFFD40022); const __m256i c16_n19_p07 = _mm256_set1_epi32(0xFFED0007); const __m256i c16_n39_p30 = _mm256_set1_epi32(0xFFD9001E); const __m256i c16_n45_p44 = _mm256_set1_epi32(0xFFD3002C); const __m256i c16_n36_p43 = _mm256_set1_epi32(0xFFDC002B); const __m256i c16_n15_p27 = _mm256_set1_epi32(0xFFF1001B); const __m256i c16_p11_p02 = _mm256_set1_epi32(0x000B0002); const __m256i c16_p34_n23 = _mm256_set1_epi32(0x0022FFE9); const __m256i c16_p45_n41 = _mm256_set1_epi32(0x002DFFD7); const __m256i c16_n07_p02 = _mm256_set1_epi32(0xFFF90002); const __m256i c16_n15_p11 = _mm256_set1_epi32(0xFFF1000B); const __m256i c16_n23_p19 = _mm256_set1_epi32(0xFFE90013); const __m256i c16_n30_p27 = _mm256_set1_epi32(0xFFE2001B); const __m256i c16_n36_p34 = _mm256_set1_epi32(0xFFDC0022); const __m256i c16_n41_p39 = _mm256_set1_epi32(0xFFD70027); const __m256i c16_n44_p43 = _mm256_set1_epi32(0xFFD4002B); const __m256i c16_n45_p45 = _mm256_set1_epi32(0xFFD3002D); // const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); const __m256i c16_p35_p40 = _mm256_set1_epi32(0x00230028); const __m256i c16_p21_p29 = _mm256_set1_epi32(0x0015001D); const __m256i c16_p04_p13 = _mm256_set1_epi32(0x0004000D); const __m256i c16_p29_p43 = _mm256_set1_epi32(0x001D002B); const __m256i c16_n21_p04 = _mm256_set1_epi32(0xFFEB0004); const __m256i c16_n45_n40 = _mm256_set1_epi32(0xFFD3FFD8); const __m256i c16_n13_n35 = _mm256_set1_epi32(0xFFF3FFDD); const __m256i c16_p04_p40 = _mm256_set1_epi32(0x00040028); const __m256i c16_n43_n35 = _mm256_set1_epi32(0xFFD5FFDD); const __m256i c16_p29_n13 = _mm256_set1_epi32(0x001DFFF3); const __m256i c16_p21_p45 = _mm256_set1_epi32(0x0015002D); const __m256i c16_n21_p35 = _mm256_set1_epi32(0xFFEB0023); const __m256i c16_p04_n43 = _mm256_set1_epi32(0x0004FFD5); const __m256i c16_p13_p45 = _mm256_set1_epi32(0x000D002D); const __m256i c16_n29_n40 = _mm256_set1_epi32(0xFFE3FFD8); const __m256i c16_n40_p29 = _mm256_set1_epi32(0xFFD8001D); const __m256i c16_p45_n13 = _mm256_set1_epi32(0x002DFFF3); const __m256i c16_n43_n04 = _mm256_set1_epi32(0xFFD5FFFC); const __m256i c16_p35_p21 = _mm256_set1_epi32(0x00230015); const __m256i c16_n45_p21 = _mm256_set1_epi32(0xFFD30015); const __m256i c16_p13_p29 = _mm256_set1_epi32(0x000D001D); const __m256i c16_p35_n43 = _mm256_set1_epi32(0x0023FFD5); const __m256i c16_n40_p04 = _mm256_set1_epi32(0xFFD80004); const __m256i c16_n35_p13 = _mm256_set1_epi32(0xFFDD000D); const __m256i c16_n40_p45 = _mm256_set1_epi32(0xFFD8002D); const __m256i c16_p04_p21 = _mm256_set1_epi32(0x00040015); const __m256i c16_p43_n29 = _mm256_set1_epi32(0x002BFFE3); const __m256i c16_n13_p04 = _mm256_set1_epi32(0xFFF30004); const __m256i c16_n29_p21 = _mm256_set1_epi32(0xFFE30015); const __m256i c16_n40_p35 = _mm256_set1_epi32(0xFFD80023); //const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_p38_p44 = _mm256_set1_epi32(0x0026002C); const __m256i c16_p09_p25 = _mm256_set1_epi32(0x00090019); const __m256i c16_n09_p38 = _mm256_set1_epi32(0xFFF70026); const __m256i c16_n25_n44 = _mm256_set1_epi32(0xFFE7FFD4); const __m256i c16_n44_p25 = _mm256_set1_epi32(0xFFD40019); const __m256i c16_p38_p09 = _mm256_set1_epi32(0x00260009); const __m256i c16_n25_p09 = _mm256_set1_epi32(0xFFE70009); const __m256i c16_n44_p38 = _mm256_set1_epi32(0xFFD40026); const __m256i c16_p17_p42 = _mm256_set1_epi32(0x0011002A); const __m256i c16_n42_p17 = _mm256_set1_epi32(0xFFD60011); const __m256i c16_p32_p32 = _mm256_set1_epi32(0x00200020); const __m256i c16_n32_p32 = 
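/* Each c16_* constant packs two signed 16-bit cosine coefficients into every 32-bit
 * lane ('p'/'n' give the sign, the digits the magnitude; the low half multiplies the
 * first row of the unpacked pair).  A single _mm256_madd_epi16 then produces
 * coef0*x0 + coef1*x1 for eight coefficient pairs at a time. */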
_mm256_set1_epi32(0xFFE00020); __m256i c32_rnd = _mm256_set1_epi32(16); int nShift = 5; // DCT1 __m256i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m256i in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]; __m256i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; __m256i res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2]; int pass, part; UNUSED_PARAMETER(i_dst); for (i = 0; i < 2; i++) { const int offset = (i << 4); in00[i] = _mm256_lddqu_si256((const __m256i*)&src[0 * 32 + offset]); in01[i] = _mm256_lddqu_si256((const __m256i*)&src[1 * 32 + offset]); in02[i] = _mm256_lddqu_si256((const __m256i*)&src[2 * 32 + offset]); in03[i] = _mm256_lddqu_si256((const __m256i*)&src[3 * 32 + offset]); in04[i] = _mm256_lddqu_si256((const __m256i*)&src[4 * 32 + offset]); in05[i] = _mm256_lddqu_si256((const __m256i*)&src[5 * 32 + offset]); in06[i] = _mm256_lddqu_si256((const __m256i*)&src[6 * 32 + offset]); in07[i] = _mm256_lddqu_si256((const __m256i*)&src[7 * 32 + offset]); in08[i] = _mm256_lddqu_si256((const __m256i*)&src[8 * 32 + offset]); in09[i] = _mm256_lddqu_si256((const __m256i*)&src[9 * 32 + offset]); in10[i] = _mm256_lddqu_si256((const __m256i*)&src[10 * 32 + offset]); in11[i] = _mm256_lddqu_si256((const __m256i*)&src[11 * 32 + offset]); in12[i] = _mm256_lddqu_si256((const __m256i*)&src[12 * 32 + offset]); in13[i] = _mm256_lddqu_si256((const __m256i*)&src[13 * 32 + offset]); in14[i] = _mm256_lddqu_si256((const __m256i*)&src[14 * 32 + offset]); in15[i] = _mm256_lddqu_si256((const __m256i*)&src[15 * 32 + offset]); in16[i] = _mm256_lddqu_si256((const __m256i*)&src[16 * 32 + offset]); in17[i] = _mm256_lddqu_si256((const __m256i*)&src[17 * 32 + offset]); in18[i] = _mm256_lddqu_si256((const __m256i*)&src[18 * 32 + offset]); in19[i] = _mm256_lddqu_si256((const __m256i*)&src[19 * 32 + offset]); in20[i] = _mm256_lddqu_si256((const __m256i*)&src[20 * 32 + offset]); in21[i] = _mm256_lddqu_si256((const __m256i*)&src[21 * 32 + offset]); in22[i] = _mm256_lddqu_si256((const __m256i*)&src[22 * 32 + offset]); in23[i] = _mm256_lddqu_si256((const __m256i*)&src[23 * 32 + offset]); in24[i] = _mm256_lddqu_si256((const __m256i*)&src[24 * 32 + offset]); in25[i] = _mm256_lddqu_si256((const __m256i*)&src[25 * 32 + offset]); in26[i] = _mm256_lddqu_si256((const __m256i*)&src[26 * 32 + offset]); in27[i] = _mm256_lddqu_si256((const __m256i*)&src[27 * 32 + offset]); in28[i] = _mm256_lddqu_si256((const __m256i*)&src[28 * 32 + offset]); in29[i] = _mm256_lddqu_si256((const __m256i*)&src[29 * 32 + offset]); in30[i] = _mm256_lddqu_si256((const __m256i*)&src[30 * 32 + offset]); in31[i] = _mm256_lddqu_si256((const __m256i*)&src[31 * 32 + offset]); } for (pass = 0; pass < 2; pass++) { for (part = 0; part < 2; part++) { const __m256i T_00_00A = _mm256_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m256i T_00_00B = _mm256_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m256i T_00_01A = _mm256_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m256i T_00_01B = _mm256_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m256i T_00_02A = _mm256_unpacklo_epi16(in09[part], 
in11[part]); // [ ] const __m256i T_00_02B = _mm256_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m256i T_00_03A = _mm256_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m256i T_00_03B = _mm256_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m256i T_00_04A = _mm256_unpacklo_epi16(in17[part], in19[part]); // [ ] const __m256i T_00_04B = _mm256_unpackhi_epi16(in17[part], in19[part]); // [ ] const __m256i T_00_05A = _mm256_unpacklo_epi16(in21[part], in23[part]); // [ ] const __m256i T_00_05B = _mm256_unpackhi_epi16(in21[part], in23[part]); // [ ] const __m256i T_00_06A = _mm256_unpacklo_epi16(in25[part], in27[part]); // [ ] const __m256i T_00_06B = _mm256_unpackhi_epi16(in25[part], in27[part]); // [ ] const __m256i T_00_07A = _mm256_unpacklo_epi16(in29[part], in31[part]); // const __m256i T_00_07B = _mm256_unpackhi_epi16(in29[part], in31[part]); // [ ] const __m256i T_00_08A = _mm256_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m256i T_00_08B = _mm256_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m256i T_00_09A = _mm256_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m256i T_00_09B = _mm256_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m256i T_00_10A = _mm256_unpacklo_epi16(in18[part], in22[part]); // [ ] const __m256i T_00_10B = _mm256_unpackhi_epi16(in18[part], in22[part]); // [ ] const __m256i T_00_11A = _mm256_unpacklo_epi16(in26[part], in30[part]); // [ ] const __m256i T_00_11B = _mm256_unpackhi_epi16(in26[part], in30[part]); // [ ] const __m256i T_00_12A = _mm256_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m256i T_00_12B = _mm256_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m256i T_00_13A = _mm256_unpacklo_epi16(in20[part], in28[part]); // [ ] const __m256i T_00_13B = _mm256_unpackhi_epi16(in20[part], in28[part]); // [ ] const __m256i T_00_14A = _mm256_unpacklo_epi16(in08[part], in24[part]); // const __m256i T_00_14B = _mm256_unpackhi_epi16(in08[part], in24[part]); // [ ] const __m256i T_00_15A = _mm256_unpacklo_epi16(in00[part], in16[part]); // const __m256i T_00_15B = _mm256_unpackhi_epi16(in00[part], in16[part]); // [ ] __m256i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m256i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m256i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m256i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m256i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(r0103, c0103), _mm256_madd_epi16(r0507, c0507)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(r0911, c0911), _mm256_madd_epi16(r1315, c1315)); \ T02 = _mm256_add_epi32(_mm256_madd_epi16(r1719, c1719), _mm256_madd_epi16(r2123, c2123)); \ T03 = _mm256_add_epi32(_mm256_madd_epi16(r2527, c2527), _mm256_madd_epi16(r2931, c2931)); \ row = _mm256_add_epi32(_mm256_add_epi32(T00, T01), _mm256_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, 
T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, 
c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m256i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(row0206, c0206), _mm256_madd_epi16(row1014, c1014)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(row1822, c1822), _mm256_madd_epi16(row2630, c2630)); \ row = _mm256_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, 
T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } EEO0A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_p38_p44), _mm256_madd_epi16(T_00_13A, c16_p09_p25)); EEO1A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n09_p38), _mm256_madd_epi16(T_00_13A, c16_n25_n44)); EEO2A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n44_p25), _mm256_madd_epi16(T_00_13A, c16_p38_p09)); EEO3A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n25_p09), _mm256_madd_epi16(T_00_13A, c16_n44_p38)); EEO0B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_p38_p44), _mm256_madd_epi16(T_00_13B, c16_p09_p25)); EEO1B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n09_p38), _mm256_madd_epi16(T_00_13B, c16_n25_n44)); EEO2B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n44_p25), _mm256_madd_epi16(T_00_13B, c16_p38_p09)); EEO3B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n25_p09), _mm256_madd_epi16(T_00_13B, c16_n44_p38)); EEEO0A = _mm256_madd_epi16(T_00_14A, c16_p17_p42); EEEO0B = _mm256_madd_epi16(T_00_14B, c16_p17_p42); EEEO1A = _mm256_madd_epi16(T_00_14A, c16_n42_p17); EEEO1B = _mm256_madd_epi16(T_00_14B, c16_n42_p17); EEEE0A = _mm256_madd_epi16(T_00_15A, c16_p32_p32); EEEE0B = _mm256_madd_epi16(T_00_15B, c16_p32_p32); EEEE1A = _mm256_madd_epi16(T_00_15A, c16_n32_p32); EEEE1B = _mm256_madd_epi16(T_00_15B, c16_n32_p32); EEE0A = _mm256_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 EEE0B = _mm256_add_epi32(EEEE0B, EEEO0B); EEE1A = _mm256_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 EEE1B = _mm256_add_epi32(EEEE1B, EEEO1B); EEE3A = _mm256_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 EEE3B = _mm256_sub_epi32(EEEE0B, EEEO0B); EEE2A = _mm256_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 EEE2B = _mm256_sub_epi32(EEEE1B, EEEO1B); EE0A = _mm256_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 EE0B = _mm256_add_epi32(EEE0B, EEO0B); EE1A = _mm256_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 EE1B = _mm256_add_epi32(EEE1B, EEO1B); EE2A = _mm256_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 EE2B = _mm256_add_epi32(EEE2B, EEO2B); EE3A = _mm256_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 EE3B = _mm256_add_epi32(EEE3B, EEO3B); EE7A = _mm256_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 EE7B = _mm256_sub_epi32(EEE0B, EEO0B); EE6A = _mm256_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 EE6B = _mm256_sub_epi32(EEE1B, EEO1B); EE5A = _mm256_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 EE5B = _mm256_sub_epi32(EEE2B, EEO2B); EE4A = _mm256_sub_epi32(EEE3A, EEO3A); // 
EE4 = EEE1 - EEO1 EE4B = _mm256_sub_epi32(EEE3B, EEO3B); E0A = _mm256_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 E0B = _mm256_add_epi32(EE0B, EO0B); E1A = _mm256_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 E1B = _mm256_add_epi32(EE1B, EO1B); E2A = _mm256_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 E2B = _mm256_add_epi32(EE2B, EO2B); E3A = _mm256_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 E3B = _mm256_add_epi32(EE3B, EO3B); E4A = _mm256_add_epi32(EE4A, EO4A); // E4 = E4B = _mm256_add_epi32(EE4B, EO4B); E5A = _mm256_add_epi32(EE5A, EO5A); // E5 = E5B = _mm256_add_epi32(EE5B, EO5B); E6A = _mm256_add_epi32(EE6A, EO6A); // E6 = E6B = _mm256_add_epi32(EE6B, EO6B); E7A = _mm256_add_epi32(EE7A, EO7A); // E7 = E7B = _mm256_add_epi32(EE7B, EO7B); EFA = _mm256_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 EFB = _mm256_sub_epi32(EE0B, EO0B); EEA = _mm256_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 EEB = _mm256_sub_epi32(EE1B, EO1B); EDA = _mm256_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 EDB = _mm256_sub_epi32(EE2B, EO2B); ECA = _mm256_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 ECB = _mm256_sub_epi32(EE3B, EO3B); EBA = _mm256_sub_epi32(EE4A, EO4A); // EB = EBB = _mm256_sub_epi32(EE4B, EO4B); EAA = _mm256_sub_epi32(EE5A, EO5A); // EA = EAB = _mm256_sub_epi32(EE5B, EO5B); E9A = _mm256_sub_epi32(EE6A, EO6A); // E9 = E9B = _mm256_sub_epi32(EE6B, EO6B); E8A = _mm256_sub_epi32(EE7A, EO7A); // E8 = E8B = _mm256_sub_epi32(EE7B, EO7B); T10A = _mm256_add_epi32(E0A, c32_rnd); // E0 + rnd T10B = _mm256_add_epi32(E0B, c32_rnd); T11A = _mm256_add_epi32(E1A, c32_rnd); // E1 + rnd T11B = _mm256_add_epi32(E1B, c32_rnd); T12A = _mm256_add_epi32(E2A, c32_rnd); // E2 + rnd T12B = _mm256_add_epi32(E2B, c32_rnd); T13A = _mm256_add_epi32(E3A, c32_rnd); // E3 + rnd T13B = _mm256_add_epi32(E3B, c32_rnd); T14A = _mm256_add_epi32(E4A, c32_rnd); // E4 + rnd T14B = _mm256_add_epi32(E4B, c32_rnd); T15A = _mm256_add_epi32(E5A, c32_rnd); // E5 + rnd T15B = _mm256_add_epi32(E5B, c32_rnd); T16A = _mm256_add_epi32(E6A, c32_rnd); // E6 + rnd T16B = _mm256_add_epi32(E6B, c32_rnd); T17A = _mm256_add_epi32(E7A, c32_rnd); // E7 + rnd T17B = _mm256_add_epi32(E7B, c32_rnd); T18A = _mm256_add_epi32(E8A, c32_rnd); // E8 + rnd T18B = _mm256_add_epi32(E8B, c32_rnd); T19A = _mm256_add_epi32(E9A, c32_rnd); // E9 + rnd T19B = _mm256_add_epi32(E9B, c32_rnd); T1AA = _mm256_add_epi32(EAA, c32_rnd); // E10 + rnd T1AB = _mm256_add_epi32(EAB, c32_rnd); T1BA = _mm256_add_epi32(EBA, c32_rnd); // E11 + rnd T1BB = _mm256_add_epi32(EBB, c32_rnd); T1CA = _mm256_add_epi32(ECA, c32_rnd); // E12 + rnd T1CB = _mm256_add_epi32(ECB, c32_rnd); T1DA = _mm256_add_epi32(EDA, c32_rnd); // E13 + rnd T1DB = _mm256_add_epi32(EDB, c32_rnd); T1EA = _mm256_add_epi32(EEA, c32_rnd); // E14 + rnd T1EB = _mm256_add_epi32(EEB, c32_rnd); T1FA = _mm256_add_epi32(EFA, c32_rnd); // E15 + rnd T1FB = _mm256_add_epi32(EFB, c32_rnd); T2_00A = _mm256_add_epi32(T10A, O00A); // E0 + O0 + rnd T2_00B = _mm256_add_epi32(T10B, O00B); T2_01A = _mm256_add_epi32(T11A, O01A); // E1 + O1 + rnd T2_01B = _mm256_add_epi32(T11B, O01B); T2_02A = _mm256_add_epi32(T12A, O02A); // E2 + O2 + rnd T2_02B = _mm256_add_epi32(T12B, O02B); T2_03A = _mm256_add_epi32(T13A, O03A); // E3 + O3 + rnd T2_03B = _mm256_add_epi32(T13B, O03B); T2_04A = _mm256_add_epi32(T14A, O04A); // E4 T2_04B = _mm256_add_epi32(T14B, O04B); T2_05A = _mm256_add_epi32(T15A, O05A); // E5 T2_05B = _mm256_add_epi32(T15B, O05B); T2_06A = _mm256_add_epi32(T16A, O06A); // E6 T2_06B = _mm256_add_epi32(T16B, O06B); T2_07A = _mm256_add_epi32(T17A, O07A); // E7 T2_07B = 
_mm256_add_epi32(T17B, O07B); T2_08A = _mm256_add_epi32(T18A, O08A); // E8 T2_08B = _mm256_add_epi32(T18B, O08B); T2_09A = _mm256_add_epi32(T19A, O09A); // E9 T2_09B = _mm256_add_epi32(T19B, O09B); T2_10A = _mm256_add_epi32(T1AA, O10A); // E10 T2_10B = _mm256_add_epi32(T1AB, O10B); T2_11A = _mm256_add_epi32(T1BA, O11A); // E11 T2_11B = _mm256_add_epi32(T1BB, O11B); T2_12A = _mm256_add_epi32(T1CA, O12A); // E12 T2_12B = _mm256_add_epi32(T1CB, O12B); T2_13A = _mm256_add_epi32(T1DA, O13A); // E13 T2_13B = _mm256_add_epi32(T1DB, O13B); T2_14A = _mm256_add_epi32(T1EA, O14A); // E14 T2_14B = _mm256_add_epi32(T1EB, O14B); T2_15A = _mm256_add_epi32(T1FA, O15A); // E15 T2_15B = _mm256_add_epi32(T1FB, O15B); T2_31A = _mm256_sub_epi32(T10A, O00A); // E0 - O0 + rnd T2_31B = _mm256_sub_epi32(T10B, O00B); T2_30A = _mm256_sub_epi32(T11A, O01A); // E1 - O1 + rnd T2_30B = _mm256_sub_epi32(T11B, O01B); T2_29A = _mm256_sub_epi32(T12A, O02A); // E2 - O2 + rnd T2_29B = _mm256_sub_epi32(T12B, O02B); T2_28A = _mm256_sub_epi32(T13A, O03A); // E3 - O3 + rnd T2_28B = _mm256_sub_epi32(T13B, O03B); T2_27A = _mm256_sub_epi32(T14A, O04A); // E4 T2_27B = _mm256_sub_epi32(T14B, O04B); T2_26A = _mm256_sub_epi32(T15A, O05A); // E5 T2_26B = _mm256_sub_epi32(T15B, O05B); T2_25A = _mm256_sub_epi32(T16A, O06A); // E6 T2_25B = _mm256_sub_epi32(T16B, O06B); T2_24A = _mm256_sub_epi32(T17A, O07A); // E7 T2_24B = _mm256_sub_epi32(T17B, O07B); T2_23A = _mm256_sub_epi32(T18A, O08A); // T2_23B = _mm256_sub_epi32(T18B, O08B); T2_22A = _mm256_sub_epi32(T19A, O09A); // T2_22B = _mm256_sub_epi32(T19B, O09B); T2_21A = _mm256_sub_epi32(T1AA, O10A); // T2_21B = _mm256_sub_epi32(T1AB, O10B); T2_20A = _mm256_sub_epi32(T1BA, O11A); // T2_20B = _mm256_sub_epi32(T1BB, O11B); T2_19A = _mm256_sub_epi32(T1CA, O12A); // T2_19B = _mm256_sub_epi32(T1CB, O12B); T2_18A = _mm256_sub_epi32(T1DA, O13A); // T2_18B = _mm256_sub_epi32(T1DB, O13B); T2_17A = _mm256_sub_epi32(T1EA, O14A); // T2_17B = _mm256_sub_epi32(T1EB, O14B); T2_16A = _mm256_sub_epi32(T1FA, O15A); // T2_16B = _mm256_sub_epi32(T1FB, O15B); T3_00A = _mm256_srai_epi32(T2_00A, nShift); // [30 20 10 00] // This operation make it much slower than 128 T3_00B = _mm256_srai_epi32(T2_00B, nShift); // [70 60 50 40] // This operation make it much slower than 128 T3_01A = _mm256_srai_epi32(T2_01A, nShift); // [31 21 11 01] // This operation make it much slower than 128 T3_01B = _mm256_srai_epi32(T2_01B, nShift); // [71 61 51 41] // This operation make it much slower than 128 T3_02A = _mm256_srai_epi32(T2_02A, nShift); // [32 22 12 02] // This operation make it much slower than 128 T3_02B = _mm256_srai_epi32(T2_02B, nShift); // [72 62 52 42] T3_03A = _mm256_srai_epi32(T2_03A, nShift); // [33 23 13 03] T3_03B = _mm256_srai_epi32(T2_03B, nShift); // [73 63 53 43] T3_04A = _mm256_srai_epi32(T2_04A, nShift); // [33 24 14 04] T3_04B = _mm256_srai_epi32(T2_04B, nShift); // [74 64 54 44] T3_05A = _mm256_srai_epi32(T2_05A, nShift); // [35 25 15 05] T3_05B = _mm256_srai_epi32(T2_05B, nShift); // [75 65 55 45] T3_06A = _mm256_srai_epi32(T2_06A, nShift); // [36 26 16 06] T3_06B = _mm256_srai_epi32(T2_06B, nShift); // [76 66 56 46] T3_07A = _mm256_srai_epi32(T2_07A, nShift); // [37 27 17 07] T3_07B = _mm256_srai_epi32(T2_07B, nShift); // [77 67 57 47] T3_08A = _mm256_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 T3_08B = _mm256_srai_epi32(T2_08B, nShift); // [70 60 50 40] T3_09A = _mm256_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 T3_09B = _mm256_srai_epi32(T2_09B, nShift); // [71 61 51 41] T3_10A = 
_mm256_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA T3_10B = _mm256_srai_epi32(T2_10B, nShift); // [72 62 52 42] T3_11A = _mm256_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB T3_11B = _mm256_srai_epi32(T2_11B, nShift); // [73 63 53 43] T3_12A = _mm256_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC T3_12B = _mm256_srai_epi32(T2_12B, nShift); // [74 64 54 44] T3_13A = _mm256_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD T3_13B = _mm256_srai_epi32(T2_13B, nShift); // [75 65 55 45] T3_14A = _mm256_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE T3_14B = _mm256_srai_epi32(T2_14B, nShift); // [76 66 56 46] T3_15A = _mm256_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF T3_15B = _mm256_srai_epi32(T2_15B, nShift); // [77 67 57 47] T3_16A = _mm256_srai_epi32(T2_16A, nShift); // [30 20 10 00] // This operation make it much slower than 128 T3_16B = _mm256_srai_epi32(T2_16B, nShift); // [70 60 50 40] // This operation make it much slower than 128 T3_17A = _mm256_srai_epi32(T2_17A, nShift); // [31 21 11 01] // This operation make it much slower than 128 T3_17B = _mm256_srai_epi32(T2_17B, nShift); // [71 61 51 41] T3_18A = _mm256_srai_epi32(T2_18A, nShift); // [32 22 12 02] T3_18B = _mm256_srai_epi32(T2_18B, nShift); // [72 62 52 42] T3_19A = _mm256_srai_epi32(T2_19A, nShift); // [33 23 13 03] T3_19B = _mm256_srai_epi32(T2_19B, nShift); // [73 63 53 43] T3_20A = _mm256_srai_epi32(T2_20A, nShift); // [33 24 14 04] T3_20B = _mm256_srai_epi32(T2_20B, nShift); // [74 64 54 44] T3_21A = _mm256_srai_epi32(T2_21A, nShift); // [35 25 15 05] T3_21B = _mm256_srai_epi32(T2_21B, nShift); // [75 65 55 45] T3_22A = _mm256_srai_epi32(T2_22A, nShift); // [36 26 16 06] T3_22B = _mm256_srai_epi32(T2_22B, nShift); // [76 66 56 46] T3_23A = _mm256_srai_epi32(T2_23A, nShift); // [37 27 17 07] T3_23B = _mm256_srai_epi32(T2_23B, nShift); // [77 67 57 47] T3_24A = _mm256_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 T3_24B = _mm256_srai_epi32(T2_24B, nShift); // [70 60 50 40] T3_25A = _mm256_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 T3_25B = _mm256_srai_epi32(T2_25B, nShift); // [71 61 51 41] T3_26A = _mm256_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA T3_26B = _mm256_srai_epi32(T2_26B, nShift); // [72 62 52 42] T3_27A = _mm256_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB T3_27B = _mm256_srai_epi32(T2_27B, nShift); // [73 63 53 43] T3_28A = _mm256_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC T3_28B = _mm256_srai_epi32(T2_28B, nShift); // [74 64 54 44] T3_29A = _mm256_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD T3_29B = _mm256_srai_epi32(T2_29B, nShift); // [75 65 55 45] T3_30A = _mm256_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE T3_30B = _mm256_srai_epi32(T2_30B, nShift); // [76 66 56 46] T3_31A = _mm256_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF T3_31B = _mm256_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm256_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm256_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm256_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm256_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm256_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm256_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm256_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm256_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm256_packs_epi32(T3_08A, T3_08B); // [A0 ... 
80] res09[part] = _mm256_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm256_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm256_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm256_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm256_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm256_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm256_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm256_packs_epi32(T3_16A, T3_16B); res17[part] = _mm256_packs_epi32(T3_17A, T3_17B); res18[part] = _mm256_packs_epi32(T3_18A, T3_18B); res19[part] = _mm256_packs_epi32(T3_19A, T3_19B); res20[part] = _mm256_packs_epi32(T3_20A, T3_20B); res21[part] = _mm256_packs_epi32(T3_21A, T3_21B); res22[part] = _mm256_packs_epi32(T3_22A, T3_22B); res23[part] = _mm256_packs_epi32(T3_23A, T3_23B); res24[part] = _mm256_packs_epi32(T3_24A, T3_24B); res25[part] = _mm256_packs_epi32(T3_25A, T3_25B); res26[part] = _mm256_packs_epi32(T3_26A, T3_26B); res27[part] = _mm256_packs_epi32(T3_27A, T3_27B); res28[part] = _mm256_packs_epi32(T3_28A, T3_28B); res29[part] = _mm256_packs_epi32(T3_29A, T3_29B); res30[part] = _mm256_packs_epi32(T3_30A, T3_30B); res31[part] = _mm256_packs_epi32(T3_31A, T3_31B); } //transpose 32x32 matrix { __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7, tr0_8, tr0_9, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; #define TRANSPOSE_16x16_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \ tr0_0 = _mm256_unpacklo_epi16(I0, I1); \ tr0_1 = _mm256_unpacklo_epi16(I2, I3); \ tr0_2 = _mm256_unpacklo_epi16(I4, I5); \ tr0_3 = _mm256_unpacklo_epi16(I6, I7); \ tr0_4 = _mm256_unpacklo_epi16(I8, I9); \ tr0_5 = _mm256_unpacklo_epi16(I10, I11); \ tr0_6 = _mm256_unpacklo_epi16(I12, I13); \ tr0_7 = _mm256_unpacklo_epi16(I14, I15); \ tr0_8 = _mm256_unpackhi_epi16(I0, I1); \ tr0_9 = _mm256_unpackhi_epi16(I2, I3); \ tr0_10 = _mm256_unpackhi_epi16(I4, I5); \ tr0_11 = _mm256_unpackhi_epi16(I6, I7); \ tr0_12 = _mm256_unpackhi_epi16(I8, I9); \ tr0_13 = _mm256_unpackhi_epi16(I10, I11); \ tr0_14 = _mm256_unpackhi_epi16(I12, I13); \ tr0_15 = _mm256_unpackhi_epi16(I14, I15); \ O0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \ O1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \ O2 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \ O3 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \ O4 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \ O5 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \ O6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \ O7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \ O8 = _mm256_unpacklo_epi32(tr0_8, tr0_9); \ O9 = _mm256_unpacklo_epi32(tr0_10, tr0_11); \ O10 = _mm256_unpacklo_epi32(tr0_12, tr0_13); \ O11 = _mm256_unpacklo_epi32(tr0_14, tr0_15); \ O12 = _mm256_unpackhi_epi32(tr0_8, tr0_9); \ O13 = _mm256_unpackhi_epi32(tr0_10, tr0_11); \ O14 = _mm256_unpackhi_epi32(tr0_12, tr0_13); \ O15 = _mm256_unpackhi_epi32(tr0_14, tr0_15); \ tr0_0 = _mm256_unpacklo_epi64(O0, O1); \ tr0_1 = _mm256_unpacklo_epi64(O2, O3); \ tr0_2 = _mm256_unpackhi_epi64(O0, O1); \ tr0_3 = _mm256_unpackhi_epi64(O2, O3); \ tr0_4 = _mm256_unpacklo_epi64(O4, O5); \ tr0_5 = _mm256_unpacklo_epi64(O6, O7); \ tr0_6 = _mm256_unpackhi_epi64(O4, O5); \ tr0_7 = _mm256_unpackhi_epi64(O6, O7); \ tr0_8 = _mm256_unpacklo_epi64(O8, O9); \ tr0_9 = _mm256_unpacklo_epi64(O10, O11); \ tr0_10 = _mm256_unpackhi_epi64(O8, O9); \ tr0_11 = _mm256_unpackhi_epi64(O10, O11); \ tr0_12 = _mm256_unpacklo_epi64(O12, O13); \ tr0_13 = 
_mm256_unpacklo_epi64(O14, O15); \ tr0_14 = _mm256_unpackhi_epi64(O12, O13); \ tr0_15 = _mm256_unpackhi_epi64(O14, O15); \ O0 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x20); \ O1 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x20); \ O2 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x20); \ O3 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x20); \ O4 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x20); \ O5 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x20); \ O6 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x20); \ O7 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x20); \ O8 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x31); \ O9 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x31); \ O10 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x31); \ O11 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x31); \ O12 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x31); \ O13 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x31); \ O14 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x31); \ O15 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x31); \ TRANSPOSE_16x16_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_16x16_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]); TRANSPOSE_16x16_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]); TRANSPOSE_16x16_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]); #undef TRANSPOSE_16x16_16BIT } c32_rnd = _mm256_set1_epi32(shift ? 
(1 << (shift - 1)) : 0); // pass == 1 (second pass) nShift = shift; } // clip max_val = _mm256_set1_epi16((1 << (clip - 1)) - 1); min_val = _mm256_set1_epi16(-(1 << (clip - 1))); for (k = 0; k < 2; k++) { in00[k] = _mm256_max_epi16(_mm256_min_epi16(in00[k], max_val), min_val); in01[k] = _mm256_max_epi16(_mm256_min_epi16(in01[k], max_val), min_val); in02[k] = _mm256_max_epi16(_mm256_min_epi16(in02[k], max_val), min_val); in03[k] = _mm256_max_epi16(_mm256_min_epi16(in03[k], max_val), min_val); in04[k] = _mm256_max_epi16(_mm256_min_epi16(in04[k], max_val), min_val); in05[k] = _mm256_max_epi16(_mm256_min_epi16(in05[k], max_val), min_val); in06[k] = _mm256_max_epi16(_mm256_min_epi16(in06[k], max_val), min_val); in07[k] = _mm256_max_epi16(_mm256_min_epi16(in07[k], max_val), min_val); in08[k] = _mm256_max_epi16(_mm256_min_epi16(in08[k], max_val), min_val); in09[k] = _mm256_max_epi16(_mm256_min_epi16(in09[k], max_val), min_val); in10[k] = _mm256_max_epi16(_mm256_min_epi16(in10[k], max_val), min_val); in11[k] = _mm256_max_epi16(_mm256_min_epi16(in11[k], max_val), min_val); in12[k] = _mm256_max_epi16(_mm256_min_epi16(in12[k], max_val), min_val); in13[k] = _mm256_max_epi16(_mm256_min_epi16(in13[k], max_val), min_val); in14[k] = _mm256_max_epi16(_mm256_min_epi16(in14[k], max_val), min_val); in15[k] = _mm256_max_epi16(_mm256_min_epi16(in15[k], max_val), min_val); in16[k] = _mm256_max_epi16(_mm256_min_epi16(in16[k], max_val), min_val); in17[k] = _mm256_max_epi16(_mm256_min_epi16(in17[k], max_val), min_val); in18[k] = _mm256_max_epi16(_mm256_min_epi16(in18[k], max_val), min_val); in19[k] = _mm256_max_epi16(_mm256_min_epi16(in19[k], max_val), min_val); in20[k] = _mm256_max_epi16(_mm256_min_epi16(in20[k], max_val), min_val); in21[k] = _mm256_max_epi16(_mm256_min_epi16(in21[k], max_val), min_val); in22[k] = _mm256_max_epi16(_mm256_min_epi16(in22[k], max_val), min_val); in23[k] = _mm256_max_epi16(_mm256_min_epi16(in23[k], max_val), min_val); in24[k] = _mm256_max_epi16(_mm256_min_epi16(in24[k], max_val), min_val); in25[k] = _mm256_max_epi16(_mm256_min_epi16(in25[k], max_val), min_val); in26[k] = _mm256_max_epi16(_mm256_min_epi16(in26[k], max_val), min_val); in27[k] = _mm256_max_epi16(_mm256_min_epi16(in27[k], max_val), min_val); in28[k] = _mm256_max_epi16(_mm256_min_epi16(in28[k], max_val), min_val); in29[k] = _mm256_max_epi16(_mm256_min_epi16(in29[k], max_val), min_val); in30[k] = _mm256_max_epi16(_mm256_min_epi16(in30[k], max_val), min_val); in31[k] = _mm256_max_epi16(_mm256_min_epi16(in31[k], max_val), min_val); } // Store for (i = 0; i < 2; i++) { const int offset = (i << 4); _mm256_storeu_si256((__m256i*)&dst[0 * 32 + offset], in00[i]); _mm256_storeu_si256((__m256i*)&dst[1 * 32 + offset], in01[i]); _mm256_storeu_si256((__m256i*)&dst[2 * 32 + offset], in02[i]); _mm256_storeu_si256((__m256i*)&dst[3 * 32 + offset], in03[i]); _mm256_storeu_si256((__m256i*)&dst[4 * 32 + offset], in04[i]); _mm256_storeu_si256((__m256i*)&dst[5 * 32 + offset], in05[i]); _mm256_storeu_si256((__m256i*)&dst[6 * 32 + offset], in06[i]); _mm256_storeu_si256((__m256i*)&dst[7 * 32 + offset], in07[i]); _mm256_storeu_si256((__m256i*)&dst[8 * 32 + offset], in08[i]); _mm256_storeu_si256((__m256i*)&dst[9 * 32 + offset], in09[i]); _mm256_storeu_si256((__m256i*)&dst[10 * 32 + offset], in10[i]); _mm256_storeu_si256((__m256i*)&dst[11 * 32 + offset], in11[i]); _mm256_storeu_si256((__m256i*)&dst[12 * 32 + offset], in12[i]); _mm256_storeu_si256((__m256i*)&dst[13 * 32 + offset], in13[i]); _mm256_storeu_si256((__m256i*)&dst[14 * 32 + offset], in14[i]);
_mm256_storeu_si256((__m256i*)&dst[15 * 32 + offset], in15[i]); _mm256_storeu_si256((__m256i*)&dst[16 * 32 + offset], in16[i]); _mm256_storeu_si256((__m256i*)&dst[17 * 32 + offset], in17[i]); _mm256_storeu_si256((__m256i*)&dst[18 * 32 + offset], in18[i]); _mm256_storeu_si256((__m256i*)&dst[19 * 32 + offset], in19[i]); _mm256_storeu_si256((__m256i*)&dst[20 * 32 + offset], in20[i]); _mm256_storeu_si256((__m256i*)&dst[21 * 32 + offset], in21[i]); _mm256_storeu_si256((__m256i*)&dst[22 * 32 + offset], in22[i]); _mm256_storeu_si256((__m256i*)&dst[23 * 32 + offset], in23[i]); _mm256_storeu_si256((__m256i*)&dst[24 * 32 + offset], in24[i]); _mm256_storeu_si256((__m256i*)&dst[25 * 32 + offset], in25[i]); _mm256_storeu_si256((__m256i*)&dst[26 * 32 + offset], in26[i]); _mm256_storeu_si256((__m256i*)&dst[27 * 32 + offset], in27[i]); _mm256_storeu_si256((__m256i*)&dst[28 * 32 + offset], in28[i]); _mm256_storeu_si256((__m256i*)&dst[29 * 32 + offset], in29[i]); _mm256_storeu_si256((__m256i*)&dst[30 * 32 + offset], in30[i]); _mm256_storeu_si256((__m256i*)&dst[31 * 32 + offset], in31[i]); } } #define TRANSPOSE_8x8_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm256_unpacklo_epi16(I0, I1); \ tr0_1 = _mm256_unpacklo_epi16(I2, I3); \ tr0_2 = _mm256_unpackhi_epi16(I0, I1); \ tr0_3 = _mm256_unpackhi_epi16(I2, I3); \ tr0_4 = _mm256_unpacklo_epi16(I4, I5); \ tr0_5 = _mm256_unpacklo_epi16(I6, I7); \ tr0_6 = _mm256_unpackhi_epi16(I4, I5); \ tr0_7 = _mm256_unpackhi_epi16(I6, I7); \ tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); #define TRANSPOSE_16x16_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \ TRANSPOSE_8x8_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, t0, t1, t2, t3, t4, t5, t6, t7); \ TRANSPOSE_8x8_16BIT_m256i(I8, I9, I10, I11, I12, I13, I14, I15, t8, t9, t10, t11, t12, t13, t14, t15); \ O0 = _mm256_permute2x128_si256(t0, t8, 0x20); \ O1 = _mm256_permute2x128_si256(t1, t9, 0x20); \ O2 = _mm256_permute2x128_si256(t2, t10, 0x20); \ O3 = _mm256_permute2x128_si256(t3, t11, 0x20); \ O4 = _mm256_permute2x128_si256(t4, t12, 0x20); \ O5 = _mm256_permute2x128_si256(t5, t13, 0x20); \ O6 = _mm256_permute2x128_si256(t6, t14, 0x20); \ O7 = _mm256_permute2x128_si256(t7, t15, 0x20); \ O8 = _mm256_permute2x128_si256(t0, t8, 0x31); \ O9 = _mm256_permute2x128_si256(t1, t9, 0x31); \ O10 = _mm256_permute2x128_si256(t2, t10, 0x31); \ O11 = _mm256_permute2x128_si256(t3, t11, 0x31); \ O12 = _mm256_permute2x128_si256(t4, t12, 0x31); \ O13 = _mm256_permute2x128_si256(t5, t13, 0x31); \ O14 = _mm256_permute2x128_si256(t6, t14, 0x31); \ O15 = _mm256_permute2x128_si256(t7, t15, 0x31); //inv_wavelet_64x16_sse128 static void inv_wavelet_64x16_avx2(coeff_t *coeff) { int i; __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, 
tr0_7; __m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; // 64*16 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; /*--vertical transform--*/ //32*8, LOAD AND SHIFT T00[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 0]), 1); T01[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 1]), 1); T02[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 2]), 1); T03[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 3]), 1); T04[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 4]), 1); T05[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 5]), 1); T06[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 6]), 1); T07[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 7]), 1); T00[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 0]), 1); T01[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 1]), 1); T02[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 2]), 1); T03[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 3]), 1); T04[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 4]), 1); T05[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 5]), 1); T06[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 6]), 1); T07[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 7]), 1); //filter (odd pixel/row) T08[0] = _mm256_srai_epi16(_mm256_add_epi16(T00[0], T01[0]), 1); T09[0] = _mm256_srai_epi16(_mm256_add_epi16(T01[0], T02[0]), 1); T10[0] = _mm256_srai_epi16(_mm256_add_epi16(T02[0], T03[0]), 1); T11[0] = _mm256_srai_epi16(_mm256_add_epi16(T03[0], T04[0]), 1); T12[0] = _mm256_srai_epi16(_mm256_add_epi16(T04[0], T05[0]), 1); T13[0] = _mm256_srai_epi16(_mm256_add_epi16(T05[0], T06[0]), 1); T14[0] = _mm256_srai_epi16(_mm256_add_epi16(T06[0], T07[0]), 1); T15[0] = _mm256_srai_epi16(_mm256_add_epi16(T07[0], T07[0]), 1); T08[1] = _mm256_srai_epi16(_mm256_add_epi16(T00[1], T01[1]), 1); T09[1] = _mm256_srai_epi16(_mm256_add_epi16(T01[1], T02[1]), 1); T10[1] = _mm256_srai_epi16(_mm256_add_epi16(T02[1], T03[1]), 1); T11[1] = _mm256_srai_epi16(_mm256_add_epi16(T03[1], T04[1]), 1); T12[1] = _mm256_srai_epi16(_mm256_add_epi16(T04[1], T05[1]), 1); T13[1] = _mm256_srai_epi16(_mm256_add_epi16(T05[1], T06[1]), 1); T14[1] = _mm256_srai_epi16(_mm256_add_epi16(T06[1], T07[1]), 1); T15[1] = _mm256_srai_epi16(_mm256_add_epi16(T07[1], T07[1]), 1); /*--transposition--*/ //32x16 -> 16x32 TRANSPOSE_16x16_16BIT_m256i(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15); TRANSPOSE_16x16_16BIT_m256i(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, 
V30, V31); /*--horizontal transform--*/ //filter (odd pixel/column) V32 = _mm256_srai_epi16(_mm256_add_epi16(V00, V01), 1); V33 = _mm256_srai_epi16(_mm256_add_epi16(V01, V02), 1); V34 = _mm256_srai_epi16(_mm256_add_epi16(V02, V03), 1); V35 = _mm256_srai_epi16(_mm256_add_epi16(V03, V04), 1); V36 = _mm256_srai_epi16(_mm256_add_epi16(V04, V05), 1); V37 = _mm256_srai_epi16(_mm256_add_epi16(V05, V06), 1); V38 = _mm256_srai_epi16(_mm256_add_epi16(V06, V07), 1); V39 = _mm256_srai_epi16(_mm256_add_epi16(V07, V08), 1); V40 = _mm256_srai_epi16(_mm256_add_epi16(V08, V09), 1); V41 = _mm256_srai_epi16(_mm256_add_epi16(V09, V10), 1); V42 = _mm256_srai_epi16(_mm256_add_epi16(V10, V11), 1); V43 = _mm256_srai_epi16(_mm256_add_epi16(V11, V12), 1); V44 = _mm256_srai_epi16(_mm256_add_epi16(V12, V13), 1); V45 = _mm256_srai_epi16(_mm256_add_epi16(V13, V14), 1); V46 = _mm256_srai_epi16(_mm256_add_epi16(V14, V15), 1); V47 = _mm256_srai_epi16(_mm256_add_epi16(V15, V16), 1); V48 = _mm256_srai_epi16(_mm256_add_epi16(V16, V17), 1); V49 = _mm256_srai_epi16(_mm256_add_epi16(V17, V18), 1); V50 = _mm256_srai_epi16(_mm256_add_epi16(V18, V19), 1); V51 = _mm256_srai_epi16(_mm256_add_epi16(V19, V20), 1); V52 = _mm256_srai_epi16(_mm256_add_epi16(V20, V21), 1); V53 = _mm256_srai_epi16(_mm256_add_epi16(V21, V22), 1); V54 = _mm256_srai_epi16(_mm256_add_epi16(V22, V23), 1); V55 = _mm256_srai_epi16(_mm256_add_epi16(V23, V24), 1); V56 = _mm256_srai_epi16(_mm256_add_epi16(V24, V25), 1); V57 = _mm256_srai_epi16(_mm256_add_epi16(V25, V26), 1); V58 = _mm256_srai_epi16(_mm256_add_epi16(V26, V27), 1); V59 = _mm256_srai_epi16(_mm256_add_epi16(V27, V28), 1); V60 = _mm256_srai_epi16(_mm256_add_epi16(V28, V29), 1); V61 = _mm256_srai_epi16(_mm256_add_epi16(V29, V30), 1); V62 = _mm256_srai_epi16(_mm256_add_epi16(V30, V31), 1); V63 = _mm256_srai_epi16(_mm256_add_epi16(V31, V31), 1); /*--transposition & Store--*/ //16x64 -> 64x16 TRANSPOSE_16x16_16BIT_m256i(V00, V32, V01, V33, V02, V34, V03, V35, V04, V36, V05, V37, V06, V38, V07, V39, T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_16x16_16BIT_m256i(V08, V40, V09, V41, V10, V42, V11, V43, V12, V44, V13, V45, V14, V46, V15, V47, T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT_m256i(V16, V48, V17, V49, V18, V50, V19, V51, V20, V52, V21, V53, V22, V54, V23, V55, T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_16x16_16BIT_m256i(V24, V56, V25, V57, V26, V58, V27, V59, V28, V60, V29, V61, V30, V62, V31, V63, T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); //store for (i = 0; i < 4; i++) { _mm256_storeu_si256((__m256i*)&coeff[16 * i], T00[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64], T01[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 2], T02[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 3], T03[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 4], T04[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 5], T05[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 6], T06[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 7], T07[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 8], T08[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 9], T09[i]); 
_mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 10], T10[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 11], T11[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 12], T12[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 13], T13[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 14], T14[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 15], T15[i]); } } static void inv_wavelet_16x64_avx2(coeff_t *coeff) { //src blk 8*32 __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; __m256i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31; __m256i S32, S33, S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63; // 64*16 __m256i TT00[8], TT01[8], TT02[8], TT03[8], TT04[8], TT05[8], TT06[8], TT07[8]; __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; int i; /*--load & shift--*/ //8*32 S00 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 0]), 1); S01 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 1]), 1); S02 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 2]), 1); S03 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 3]), 1); S04 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 4]), 1); S05 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 5]), 1); S06 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 6]), 1); S07 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 7]), 1); S08 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 8]), 1); S09 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 9]), 1); S10 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 10]), 1); S11 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 11]), 1); S12 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 12]), 1); S13 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 13]), 1); S14 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 14]), 1); S15 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 15]), 1); S16 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 16]), 1); S17 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 17]), 1); S18 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 18]), 1); S19 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 19]), 1); S20 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 20]), 1); S21 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 21]), 1); S22 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 22]), 1); S23 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 23]), 1); S24 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 24]), 1); S25 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 25]), 1); S26 = 
_mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 26]), 1); S27 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 27]), 1); S28 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 28]), 1); S29 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 29]), 1); S30 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 30]), 1); S31 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 31]), 1); /*--vertical transform--*/ S32 = _mm256_srai_epi16(_mm256_add_epi16(S00, S01), 1); S33 = _mm256_srai_epi16(_mm256_add_epi16(S01, S02), 1); S34 = _mm256_srai_epi16(_mm256_add_epi16(S02, S03), 1); S35 = _mm256_srai_epi16(_mm256_add_epi16(S03, S04), 1); S36 = _mm256_srai_epi16(_mm256_add_epi16(S04, S05), 1); S37 = _mm256_srai_epi16(_mm256_add_epi16(S05, S06), 1); S38 = _mm256_srai_epi16(_mm256_add_epi16(S06, S07), 1); S39 = _mm256_srai_epi16(_mm256_add_epi16(S07, S08), 1); S40 = _mm256_srai_epi16(_mm256_add_epi16(S08, S09), 1); S41 = _mm256_srai_epi16(_mm256_add_epi16(S09, S10), 1); S42 = _mm256_srai_epi16(_mm256_add_epi16(S10, S11), 1); S43 = _mm256_srai_epi16(_mm256_add_epi16(S11, S12), 1); S44 = _mm256_srai_epi16(_mm256_add_epi16(S12, S13), 1); S45 = _mm256_srai_epi16(_mm256_add_epi16(S13, S14), 1); S46 = _mm256_srai_epi16(_mm256_add_epi16(S14, S15), 1); S47 = _mm256_srai_epi16(_mm256_add_epi16(S15, S16), 1); S48 = _mm256_srai_epi16(_mm256_add_epi16(S16, S17), 1); S49 = _mm256_srai_epi16(_mm256_add_epi16(S17, S18), 1); S50 = _mm256_srai_epi16(_mm256_add_epi16(S18, S19), 1); S51 = _mm256_srai_epi16(_mm256_add_epi16(S19, S20), 1); S52 = _mm256_srai_epi16(_mm256_add_epi16(S20, S21), 1); S53 = _mm256_srai_epi16(_mm256_add_epi16(S21, S22), 1); S54 = _mm256_srai_epi16(_mm256_add_epi16(S22, S23), 1); S55 = _mm256_srai_epi16(_mm256_add_epi16(S23, S24), 1); S56 = _mm256_srai_epi16(_mm256_add_epi16(S24, S25), 1); S57 = _mm256_srai_epi16(_mm256_add_epi16(S25, S26), 1); S58 = _mm256_srai_epi16(_mm256_add_epi16(S26, S27), 1); S59 = _mm256_srai_epi16(_mm256_add_epi16(S27, S28), 1); S60 = _mm256_srai_epi16(_mm256_add_epi16(S28, S29), 1); S61 = _mm256_srai_epi16(_mm256_add_epi16(S29, S30), 1); S62 = _mm256_srai_epi16(_mm256_add_epi16(S30, S31), 1); S63 = _mm256_srai_epi16(_mm256_add_epi16(S31, S31), 1); /*--transposition--*/ //8x64 -> 64x8 TRANSPOSE_8x8_16BIT_m256i(S00, S32, S01, S33, S02, S34, S03, S35, TT00[0], TT01[0], TT02[0], TT03[0], TT04[0], TT05[0], TT06[0], TT07[0]); TRANSPOSE_8x8_16BIT_m256i(S04, S36, S05, S37, S06, S38, S07, S39, TT00[1], TT01[1], TT02[1], TT03[1], TT04[1], TT05[1], TT06[1], TT07[1]); TRANSPOSE_8x8_16BIT_m256i(S08, S40, S09, S41, S10, S42, S11, S43, TT00[2], TT01[2], TT02[2], TT03[2], TT04[2], TT05[2], TT06[2], TT07[2]); TRANSPOSE_8x8_16BIT_m256i(S12, S44, S13, S45, S14, S46, S15, S47, TT00[3], TT01[3], TT02[3], TT03[3], TT04[3], TT05[3], TT06[3], TT07[3]); TRANSPOSE_8x8_16BIT_m256i(S16, S48, S17, S49, S18, S50, S19, S51, TT00[4], TT01[4], TT02[4], TT03[4], TT04[4], TT05[4], TT06[4], TT07[4]); TRANSPOSE_8x8_16BIT_m256i(S20, S52, S21, S53, S22, S54, S23, S55, TT00[5], TT01[5], TT02[5], TT03[5], TT04[5], TT05[5], TT06[5], TT07[5]); TRANSPOSE_8x8_16BIT_m256i(S24, S56, S25, S57, S26, S58, S27, S59, TT00[6], TT01[6], TT02[6], TT03[6], TT04[6], TT05[6], TT06[6], TT07[6]); TRANSPOSE_8x8_16BIT_m256i(S28, S60, S29, S61, S30, S62, S31, S63, TT00[7], TT01[7], TT02[7], TT03[7], TT04[7], TT05[7], TT06[7], TT07[7]); T00[0] = _mm256_permute2x128_si256(TT00[0], TT00[1], 0x20); T00[1] = _mm256_permute2x128_si256(TT00[2], TT00[3], 0x20); T00[2] = 
_mm256_permute2x128_si256(TT00[4], TT00[5], 0x20); T00[3] = _mm256_permute2x128_si256(TT00[6], TT00[7], 0x20); T01[0] = _mm256_permute2x128_si256(TT01[0], TT01[1], 0x20); T01[1] = _mm256_permute2x128_si256(TT01[2], TT01[3], 0x20); T01[2] = _mm256_permute2x128_si256(TT01[4], TT01[5], 0x20); T01[3] = _mm256_permute2x128_si256(TT01[6], TT01[7], 0x20); T02[0] = _mm256_permute2x128_si256(TT02[0], TT02[1], 0x20); T02[1] = _mm256_permute2x128_si256(TT02[2], TT02[3], 0x20); T02[2] = _mm256_permute2x128_si256(TT02[4], TT02[5], 0x20); T02[3] = _mm256_permute2x128_si256(TT02[6], TT02[7], 0x20); T03[0] = _mm256_permute2x128_si256(TT03[0], TT03[1], 0x20); T03[1] = _mm256_permute2x128_si256(TT03[2], TT03[3], 0x20); T03[2] = _mm256_permute2x128_si256(TT03[4], TT03[5], 0x20); T03[3] = _mm256_permute2x128_si256(TT03[6], TT03[7], 0x20); T04[0] = _mm256_permute2x128_si256(TT04[0], TT04[1], 0x20); T04[1] = _mm256_permute2x128_si256(TT04[2], TT04[3], 0x20); T04[2] = _mm256_permute2x128_si256(TT04[4], TT04[5], 0x20); T04[3] = _mm256_permute2x128_si256(TT04[6], TT04[7], 0x20); T05[0] = _mm256_permute2x128_si256(TT05[0], TT05[1], 0x20); T05[1] = _mm256_permute2x128_si256(TT05[2], TT05[3], 0x20); T05[2] = _mm256_permute2x128_si256(TT05[4], TT05[5], 0x20); T05[3] = _mm256_permute2x128_si256(TT05[6], TT05[7], 0x20); T06[0] = _mm256_permute2x128_si256(TT06[0], TT06[1], 0x20); T06[1] = _mm256_permute2x128_si256(TT06[2], TT06[3], 0x20); T06[2] = _mm256_permute2x128_si256(TT06[4], TT06[5], 0x20); T06[3] = _mm256_permute2x128_si256(TT06[6], TT06[7], 0x20); T07[0] = _mm256_permute2x128_si256(TT07[0], TT07[1], 0x20); T07[1] = _mm256_permute2x128_si256(TT07[2], TT07[3], 0x20); T07[2] = _mm256_permute2x128_si256(TT07[4], TT07[5], 0x20); T07[3] = _mm256_permute2x128_si256(TT07[6], TT07[7], 0x20); /*--horizontal transform--*/ for (i = 0; i < 4; i++) { T08[i] = _mm256_srai_epi16(_mm256_add_epi16(T00[i], T01[i]), 1); T09[i] = _mm256_srai_epi16(_mm256_add_epi16(T01[i], T02[i]), 1); T10[i] = _mm256_srai_epi16(_mm256_add_epi16(T02[i], T03[i]), 1); T11[i] = _mm256_srai_epi16(_mm256_add_epi16(T03[i], T04[i]), 1); T12[i] = _mm256_srai_epi16(_mm256_add_epi16(T04[i], T05[i]), 1); T13[i] = _mm256_srai_epi16(_mm256_add_epi16(T05[i], T06[i]), 1); T14[i] = _mm256_srai_epi16(_mm256_add_epi16(T06[i], T07[i]), 1); T15[i] = _mm256_srai_epi16(_mm256_add_epi16(T07[i], T07[i]), 1); } /*--transposition--*/ //64x16 -> 16x64 TRANSPOSE_16x16_16BIT_m256i(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15); TRANSPOSE_16x16_16BIT_m256i(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31); TRANSPOSE_16x16_16BIT_m256i(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47); TRANSPOSE_16x16_16BIT_m256i(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63); /*--Store--*/ //16x64 _mm256_storeu_si256((__m256i*)&coeff[16 * 0], V00); _mm256_storeu_si256((__m256i*)&coeff[16 * 1], V01); _mm256_storeu_si256((__m256i*)&coeff[16 * 2], V02); 
_mm256_storeu_si256((__m256i*)&coeff[16 * 3], V03); _mm256_storeu_si256((__m256i*)&coeff[16 * 4], V04); _mm256_storeu_si256((__m256i*)&coeff[16 * 5], V05); _mm256_storeu_si256((__m256i*)&coeff[16 * 6], V06); _mm256_storeu_si256((__m256i*)&coeff[16 * 7], V07); _mm256_storeu_si256((__m256i*)&coeff[16 * 8], V08); _mm256_storeu_si256((__m256i*)&coeff[16 * 9], V09); _mm256_storeu_si256((__m256i*)&coeff[16 * 10], V10); _mm256_storeu_si256((__m256i*)&coeff[16 * 11], V11); _mm256_storeu_si256((__m256i*)&coeff[16 * 12], V12); _mm256_storeu_si256((__m256i*)&coeff[16 * 13], V13); _mm256_storeu_si256((__m256i*)&coeff[16 * 14], V14); _mm256_storeu_si256((__m256i*)&coeff[16 * 15], V15); _mm256_storeu_si256((__m256i*)&coeff[16 * 16], V16); _mm256_storeu_si256((__m256i*)&coeff[16 * 17], V17); _mm256_storeu_si256((__m256i*)&coeff[16 * 18], V18); _mm256_storeu_si256((__m256i*)&coeff[16 * 19], V19); _mm256_storeu_si256((__m256i*)&coeff[16 * 20], V20); _mm256_storeu_si256((__m256i*)&coeff[16 * 21], V21); _mm256_storeu_si256((__m256i*)&coeff[16 * 22], V22); _mm256_storeu_si256((__m256i*)&coeff[16 * 23], V23); _mm256_storeu_si256((__m256i*)&coeff[16 * 24], V24); _mm256_storeu_si256((__m256i*)&coeff[16 * 25], V25); _mm256_storeu_si256((__m256i*)&coeff[16 * 26], V26); _mm256_storeu_si256((__m256i*)&coeff[16 * 27], V27); _mm256_storeu_si256((__m256i*)&coeff[16 * 28], V28); _mm256_storeu_si256((__m256i*)&coeff[16 * 29], V29); _mm256_storeu_si256((__m256i*)&coeff[16 * 30], V30); _mm256_storeu_si256((__m256i*)&coeff[16 * 31], V31); _mm256_storeu_si256((__m256i*)&coeff[16 * 32], V32); _mm256_storeu_si256((__m256i*)&coeff[16 * 33], V33); _mm256_storeu_si256((__m256i*)&coeff[16 * 34], V34); _mm256_storeu_si256((__m256i*)&coeff[16 * 35], V35); _mm256_storeu_si256((__m256i*)&coeff[16 * 36], V36); _mm256_storeu_si256((__m256i*)&coeff[16 * 37], V37); _mm256_storeu_si256((__m256i*)&coeff[16 * 38], V38); _mm256_storeu_si256((__m256i*)&coeff[16 * 39], V39); _mm256_storeu_si256((__m256i*)&coeff[16 * 40], V40); _mm256_storeu_si256((__m256i*)&coeff[16 * 41], V41); _mm256_storeu_si256((__m256i*)&coeff[16 * 42], V42); _mm256_storeu_si256((__m256i*)&coeff[16 * 43], V43); _mm256_storeu_si256((__m256i*)&coeff[16 * 44], V44); _mm256_storeu_si256((__m256i*)&coeff[16 * 45], V45); _mm256_storeu_si256((__m256i*)&coeff[16 * 46], V46); _mm256_storeu_si256((__m256i*)&coeff[16 * 47], V47); _mm256_storeu_si256((__m256i*)&coeff[16 * 48], V48); _mm256_storeu_si256((__m256i*)&coeff[16 * 49], V49); _mm256_storeu_si256((__m256i*)&coeff[16 * 50], V50); _mm256_storeu_si256((__m256i*)&coeff[16 * 51], V51); _mm256_storeu_si256((__m256i*)&coeff[16 * 52], V52); _mm256_storeu_si256((__m256i*)&coeff[16 * 53], V53); _mm256_storeu_si256((__m256i*)&coeff[16 * 54], V54); _mm256_storeu_si256((__m256i*)&coeff[16 * 55], V55); _mm256_storeu_si256((__m256i*)&coeff[16 * 56], V56); _mm256_storeu_si256((__m256i*)&coeff[16 * 57], V57); _mm256_storeu_si256((__m256i*)&coeff[16 * 58], V58); _mm256_storeu_si256((__m256i*)&coeff[16 * 59], V59); _mm256_storeu_si256((__m256i*)&coeff[16 * 60], V60); _mm256_storeu_si256((__m256i*)&coeff[16 * 61], V61); _mm256_storeu_si256((__m256i*)&coeff[16 * 62], V62); _mm256_storeu_si256((__m256i*)&coeff[16 * 63], V63); } static void inv_wavelet_64x64_avx2(coeff_t *coeff) { int i; __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; // 64*64 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], 
T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4]; // 64*64 __m256i V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V32[4], V33[4], V34[4], V35[4], V36[4], V37[4], V38[4], V39[4], V40[4], V41[4], V42[4], V43[4], V44[4], V45[4], V46[4], V47[4], V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4]; /*--vertical transform--*/ //32*32, LOAD AND SHIFT for (i = 0; i < 2; i++) { T00[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 0]), 1); T01[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 1]), 1); T02[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 2]), 1); T03[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 3]), 1); T04[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 4]), 1); T05[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 5]), 1); T06[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 6]), 1); T07[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 7]), 1); T08[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 8]), 1); T09[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 9]), 1); T10[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 10]), 1); T11[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 11]), 1); T12[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 12]), 1); T13[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 13]), 1); T14[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 14]), 1); T15[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 15]), 1); T16[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 16]), 1); T17[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 17]), 1); T18[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 18]), 1); T19[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 19]), 1); T20[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 20]), 1); T21[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 21]), 1); T22[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 22]), 1); T23[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 23]), 1); T24[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 24]), 1); T25[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 25]), 1); T26[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 26]), 1); T27[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 27]), 1); T28[i] = 
_mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 28]), 1); T29[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 29]), 1); T30[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 30]), 1); T31[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 31]), 1); } //filter (odd pixel/row) for (i = 0; i < 4; i++) { T32[i] = _mm256_srai_epi16(_mm256_add_epi16(T00[i], T01[i]), 1); T33[i] = _mm256_srai_epi16(_mm256_add_epi16(T01[i], T02[i]), 1); T34[i] = _mm256_srai_epi16(_mm256_add_epi16(T02[i], T03[i]), 1); T35[i] = _mm256_srai_epi16(_mm256_add_epi16(T03[i], T04[i]), 1); T36[i] = _mm256_srai_epi16(_mm256_add_epi16(T04[i], T05[i]), 1); T37[i] = _mm256_srai_epi16(_mm256_add_epi16(T05[i], T06[i]), 1); T38[i] = _mm256_srai_epi16(_mm256_add_epi16(T06[i], T07[i]), 1); T39[i] = _mm256_srai_epi16(_mm256_add_epi16(T07[i], T08[i]), 1); T40[i] = _mm256_srai_epi16(_mm256_add_epi16(T08[i], T09[i]), 1); T41[i] = _mm256_srai_epi16(_mm256_add_epi16(T09[i], T10[i]), 1); T42[i] = _mm256_srai_epi16(_mm256_add_epi16(T10[i], T11[i]), 1); T43[i] = _mm256_srai_epi16(_mm256_add_epi16(T11[i], T12[i]), 1); T44[i] = _mm256_srai_epi16(_mm256_add_epi16(T12[i], T13[i]), 1); T45[i] = _mm256_srai_epi16(_mm256_add_epi16(T13[i], T14[i]), 1); T46[i] = _mm256_srai_epi16(_mm256_add_epi16(T14[i], T15[i]), 1); T47[i] = _mm256_srai_epi16(_mm256_add_epi16(T15[i], T16[i]), 1); T48[i] = _mm256_srai_epi16(_mm256_add_epi16(T16[i], T17[i]), 1); T49[i] = _mm256_srai_epi16(_mm256_add_epi16(T17[i], T18[i]), 1); T50[i] = _mm256_srai_epi16(_mm256_add_epi16(T18[i], T19[i]), 1); T51[i] = _mm256_srai_epi16(_mm256_add_epi16(T19[i], T20[i]), 1); T52[i] = _mm256_srai_epi16(_mm256_add_epi16(T20[i], T21[i]), 1); T53[i] = _mm256_srai_epi16(_mm256_add_epi16(T21[i], T22[i]), 1); T54[i] = _mm256_srai_epi16(_mm256_add_epi16(T22[i], T23[i]), 1); T55[i] = _mm256_srai_epi16(_mm256_add_epi16(T23[i], T24[i]), 1); T56[i] = _mm256_srai_epi16(_mm256_add_epi16(T24[i], T25[i]), 1); T57[i] = _mm256_srai_epi16(_mm256_add_epi16(T25[i], T26[i]), 1); T58[i] = _mm256_srai_epi16(_mm256_add_epi16(T26[i], T27[i]), 1); T59[i] = _mm256_srai_epi16(_mm256_add_epi16(T27[i], T28[i]), 1); T60[i] = _mm256_srai_epi16(_mm256_add_epi16(T28[i], T29[i]), 1); T61[i] = _mm256_srai_epi16(_mm256_add_epi16(T29[i], T30[i]), 1); T62[i] = _mm256_srai_epi16(_mm256_add_epi16(T30[i], T31[i]), 1); T63[i] = _mm256_srai_epi16(_mm256_add_epi16(T31[i], T31[i]), 1); } /*--transposition--*/ //32x64 -> 64x32 TRANSPOSE_16x16_16BIT_m256i(T00[0], T32[0], T01[0], T33[0], T02[0], T34[0], T03[0], T35[0], T04[0], T36[0], T05[0], T37[0], T06[0], T38[0], T07[0], T39[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_16x16_16BIT_m256i(T08[0], T40[0], T09[0], T41[0], T10[0], T42[0], T11[0], T43[0], T12[0], T44[0], T13[0], T45[0], T14[0], T46[0], T15[0], T47[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_16x16_16BIT_m256i(T16[0], T48[0], T17[0], T49[0], T18[0], T50[0], T19[0], T51[0], T20[0], T52[0], T21[0], T53[0], T22[0], T54[0], T23[0], T55[0], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2]); TRANSPOSE_16x16_16BIT_m256i(T24[0], T56[0], T25[0], T57[0], T26[0], T58[0], T27[0], T59[0], T28[0], T60[0], T29[0], T61[0], T30[0], T62[0], T31[0], T63[0], V00[3], V01[3], V02[3], 
V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], V14[3], V15[3]); TRANSPOSE_16x16_16BIT_m256i(T00[1], T32[1], T01[1], T33[1], T02[1], T34[1], T03[1], T35[1], T04[1], T36[1], T05[1], T37[1], T06[1], T38[1], T07[1], T39[1], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_16x16_16BIT_m256i(T08[1], T40[1], T09[1], T41[1], T10[1], T42[1], T11[1], T43[1], T12[1], T44[1], T13[1], T45[1], T14[1], T46[1], T15[1], T47[1], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_16x16_16BIT_m256i(T16[1], T48[1], T17[1], T49[1], T18[1], T50[1], T19[1], T51[1], T20[1], T52[1], T21[1], T53[1], T22[1], T54[1], T23[1], T55[1], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2]); TRANSPOSE_16x16_16BIT_m256i(T24[1], T56[1], T25[1], T57[1], T26[1], T58[1], T27[1], T59[1], T28[1], T60[1], T29[1], T61[1], T30[1], T62[1], T31[1], T63[1], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3]); /*--horizontal transform--*/ //filter (odd pixel/column) for (i = 0; i < 4; i++) { V32[i] = _mm256_srai_epi16(_mm256_add_epi16(V00[i], V01[i]), 1); V33[i] = _mm256_srai_epi16(_mm256_add_epi16(V01[i], V02[i]), 1); V34[i] = _mm256_srai_epi16(_mm256_add_epi16(V02[i], V03[i]), 1); V35[i] = _mm256_srai_epi16(_mm256_add_epi16(V03[i], V04[i]), 1); V36[i] = _mm256_srai_epi16(_mm256_add_epi16(V04[i], V05[i]), 1); V37[i] = _mm256_srai_epi16(_mm256_add_epi16(V05[i], V06[i]), 1); V38[i] = _mm256_srai_epi16(_mm256_add_epi16(V06[i], V07[i]), 1); V39[i] = _mm256_srai_epi16(_mm256_add_epi16(V07[i], V08[i]), 1); V40[i] = _mm256_srai_epi16(_mm256_add_epi16(V08[i], V09[i]), 1); V41[i] = _mm256_srai_epi16(_mm256_add_epi16(V09[i], V10[i]), 1); V42[i] = _mm256_srai_epi16(_mm256_add_epi16(V10[i], V11[i]), 1); V43[i] = _mm256_srai_epi16(_mm256_add_epi16(V11[i], V12[i]), 1); V44[i] = _mm256_srai_epi16(_mm256_add_epi16(V12[i], V13[i]), 1); V45[i] = _mm256_srai_epi16(_mm256_add_epi16(V13[i], V14[i]), 1); V46[i] = _mm256_srai_epi16(_mm256_add_epi16(V14[i], V15[i]), 1); V47[i] = _mm256_srai_epi16(_mm256_add_epi16(V15[i], V16[i]), 1); V48[i] = _mm256_srai_epi16(_mm256_add_epi16(V16[i], V17[i]), 1); V49[i] = _mm256_srai_epi16(_mm256_add_epi16(V17[i], V18[i]), 1); V50[i] = _mm256_srai_epi16(_mm256_add_epi16(V18[i], V19[i]), 1); V51[i] = _mm256_srai_epi16(_mm256_add_epi16(V19[i], V20[i]), 1); V52[i] = _mm256_srai_epi16(_mm256_add_epi16(V20[i], V21[i]), 1); V53[i] = _mm256_srai_epi16(_mm256_add_epi16(V21[i], V22[i]), 1); V54[i] = _mm256_srai_epi16(_mm256_add_epi16(V22[i], V23[i]), 1); V55[i] = _mm256_srai_epi16(_mm256_add_epi16(V23[i], V24[i]), 1); V56[i] = _mm256_srai_epi16(_mm256_add_epi16(V24[i], V25[i]), 1); V57[i] = _mm256_srai_epi16(_mm256_add_epi16(V25[i], V26[i]), 1); V58[i] = _mm256_srai_epi16(_mm256_add_epi16(V26[i], V27[i]), 1); V59[i] = _mm256_srai_epi16(_mm256_add_epi16(V27[i], V28[i]), 1); V60[i] = _mm256_srai_epi16(_mm256_add_epi16(V28[i], V29[i]), 1); V61[i] = _mm256_srai_epi16(_mm256_add_epi16(V29[i], V30[i]), 1); V62[i] = _mm256_srai_epi16(_mm256_add_epi16(V30[i], V31[i]), 1); V63[i] = _mm256_srai_epi16(_mm256_add_epi16(V31[i], V31[i]), 1); } /*--transposition & Store--*/ //64x64 TRANSPOSE_16x16_16BIT_m256i(V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], V04[0], 
V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_16x16_16BIT_m256i(V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0]); TRANSPOSE_16x16_16BIT_m256i(V00[2], V32[2], V01[2], V33[2], V02[2], V34[2], V03[2], V35[2], V04[2], V36[2], V05[2], V37[2], V06[2], V38[2], V07[2], V39[2], T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0]); TRANSPOSE_16x16_16BIT_m256i(V00[3], V32[3], V01[3], V33[3], V02[3], V34[3], V03[3], V35[3], V04[3], V36[3], V05[3], V37[3], V06[3], V38[3], V07[3], V39[3], T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0]); TRANSPOSE_16x16_16BIT_m256i(V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT_m256i(V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1]); TRANSPOSE_16x16_16BIT_m256i(V08[2], V40[2], V09[2], V41[2], V10[2], V42[2], V11[2], V43[2], V12[2], V44[2], V13[2], V45[2], V14[2], V46[2], V15[2], V47[2], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1]); TRANSPOSE_16x16_16BIT_m256i(V08[3], V40[3], V09[3], V41[3], V10[3], V42[3], V11[3], V43[3], V12[3], V44[3], V13[3], V45[3], V14[3], V46[3], V15[3], V47[3], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1]); TRANSPOSE_16x16_16BIT_m256i(V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_16x16_16BIT_m256i(V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2]); TRANSPOSE_16x16_16BIT_m256i(V16[2], V48[2], V17[2], V49[2], V18[2], V50[2], V19[2], V51[2], V20[2], V52[2], V21[2], V53[2], V22[2], V54[2], V23[2], V55[2], T32[2], T33[2], T34[2], T35[2], T36[2], T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2]); TRANSPOSE_16x16_16BIT_m256i(V16[3], V48[3], V17[3], V49[3], V18[3], V50[3], V19[3], V51[3], V20[3], V52[3], V21[3], V53[3], V22[3], V54[3], V23[3], V55[3], T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2]); TRANSPOSE_16x16_16BIT_m256i(V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], 
T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_16x16_16BIT_m256i(V24[1], V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3]); TRANSPOSE_16x16_16BIT_m256i(V24[2], V56[2], V25[2], V57[2], V26[2], V58[2], V27[2], V59[2], V28[2], V60[2], V29[2], V61[2], V30[2], V62[2], V31[2], V63[2], T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3]); TRANSPOSE_16x16_16BIT_m256i(V24[3], V56[3], V25[3], V57[3], V26[3], V58[3], V27[3], V59[3], V28[3], V60[3], V29[3], V61[3], V30[3], V62[3], V31[3], V63[3], T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3]); //store for (i = 0; i < 4; i++) { _mm256_storeu_si256((__m256i*)&coeff[16 * i], T00[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64], T01[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 2], T02[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 3], T03[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 4], T04[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 5], T05[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 6], T06[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 7], T07[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 8], T08[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 9], T09[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 10], T10[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 11], T11[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 12], T12[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 13], T13[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 14], T14[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 15], T15[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 16], T16[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 17], T17[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 18], T18[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 19], T19[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 20], T20[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 21], T21[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 22], T22[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 23], T23[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 24], T24[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 25], T25[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 26], T26[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 27], T27[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 28], T28[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 29], T29[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 30], T30[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 31], T31[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 32], T32[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 33], T33[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 34], T34[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 35], T35[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 36], T36[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 37], T37[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 38], T38[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 39], T39[i]); 
_mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 40], T40[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 41], T41[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 42], T42[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 43], T43[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 44], T44[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 45], T45[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 46], T46[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 47], T47[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 48], T48[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 49], T49[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 50], T50[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 51], T51[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 52], T52[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 53], T53[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 54], T54[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 55], T55[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 56], T56[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 57], T57[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 58], T58[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 59], T59[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 60], T60[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 61], T61[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 62], T62[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 63], T63[i]); } } /* --------------------------------------------------------------------------- */ void idct_c_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_32x32_avx2(src, dst, 32 | 0x01); inv_wavelet_64x64_avx2(dst); } /* --------------------------------------------------------------------------- */ void idct_c_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_32x8_sse128(src, dst, 32 | 0x01); inv_wavelet_64x16_avx2(dst); } /* --------------------------------------------------------------------------- */ void idct_c_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_c_8x32_sse128(src, dst, 8 | 0x01); inv_wavelet_16x64_avx2(dst); } xavs2-1.3/source/common/vec/intrinsic_inter_pred.c000066400000000000000000003051411340660520300223510ustar00rootroot00000000000000/* * intrinsic_inter-pred.c * * Description of this file: * SSE assembly functions of Inter-Prediction module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#include "../basic_types.h"
#include "intrinsic.h"
#include "avs2_defs.h"

/* --------------------------------------------------------------------------- */
void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    const int16_t offset = 32;
    const int shift = 6;
    int row, col;
    const __m128i mAddOffset = _mm_set1_epi16(offset);
    const __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
    const __m128i mSwitch2 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
    const __m128i mCoef = _mm_set1_epi32(*(int*)coeff);
    const __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1]));

    src -= 1;

    for (row = 0; row < height; row++) {
        __m128i mSrc, mT20, mT40, mVal;

        for (col = 0; col < width - 7; col += 8) {
            mSrc = _mm_loadu_si128((__m128i*)(src + col));
            mT20 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoef);
            mT40 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoef);
            mVal = _mm_hadd_epi16(mT20, mT40);
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)&dst[col], mVal);
        }

        if (col < width) {
            mSrc = _mm_loadu_si128((__m128i*)(src + col));
            mT20 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoef);
            mT40 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoef);
            mVal = _mm_hadd_epi16(mT20, mT40);
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]);
        }

        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    int row, col = 0;
    const short offset = 32;
    const int shift = 6;
    __m128i mAddOffset = _mm_set1_epi16(offset);
    __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
    __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
    __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
    __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
    __m128i mCoef = _mm_loadl_epi64((__m128i*)coeff);
    __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1]));

    mCoef = _mm_unpacklo_epi64(mCoef, mCoef);
    src -= 3;

    for (row = 0; row < height; row++) {
        __m128i srcCoeff, T20, T40, T60, T80, sum;

        for (col = 0; col < width - 7; col += 8) {
            srcCoeff = _mm_loadu_si128((__m128i*)(src + col));
            T20 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch1), mCoef);
            T40 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch2), mCoef);
            T60 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch3), mCoef);
            T80 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch4), mCoef);
            sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80));
            sum = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift);
            sum = _mm_packus_epi16(sum, sum);
            _mm_storel_epi64((__m128i*)&dst[col], sum);
        }

        if (col < width) {
            srcCoeff =
_mm_loadu_si128((__m128i*)(src + col)); T20 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch1), mCoef); T40 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch2), mCoef); T60 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch3), mCoef); T80 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch4), mCoef); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); sum = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); sum = _mm_packus_epi16(sum, sum); _mm_maskmoveu_si128(sum, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoef = _mm_loadl_epi64((__m128i*)coeff); mCoef = _mm_unpacklo_epi64(mCoef, mCoef); __m128i T01, T23, T45, T67, T89, Tab, Tcd, Tef; __m128i S1, S2, S3, S4; __m128i U0, U1; __m128i Val1, Val2, Val; __m128i mask8 = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 8) - 1])); __m128i mask4 = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 4) - 1])); __m128i maskx = _mm_loadu_si128((__m128i*)(intrinsic_mask[((width & 7) << 1) - 1])); src -= 3; for (row = 0; row < height; row++) { for (col = 0; col < width - 15; col += 16) { __m128i srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); __m128i srcCoeff2 = _mm_loadu_si128((__m128i*)(src + col + 8)); T01 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T23 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T45 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T67 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); S1 = _mm_hadd_epi16(T01, T23); S2 = _mm_hadd_epi16(T45, T67); U0 = _mm_hadd_epi16(S1, S2); _mm_storeu_si128((__m128i*)&tmp[col], U0); T89 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch1), mCoef); Tab = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch2), mCoef); Tcd = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch3), mCoef); Tef = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch4), mCoef); S3 = _mm_hadd_epi16(T89, Tab); S4 = _mm_hadd_epi16(Tcd, Tef); U1 = _mm_hadd_epi16(S3, S4); _mm_storeu_si128((__m128i*)&tmp[col + 8], U1); Val1 = _mm_add_epi16(U0, mAddOffset); Val2 = _mm_add_epi16(U1, mAddOffset); Val1 = _mm_srai_epi16(Val1, shift); Val2 = _mm_srai_epi16(Val2, shift); Val = _mm_packus_epi16(Val1, Val2); _mm_storeu_si128((__m128i*)&dst[col], Val); } if (col < width - 7) { __m128i srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); T01 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T23 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T45 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T67 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); S1 = _mm_hadd_epi16(T01, T23); S2 = _mm_hadd_epi16(T45, T67); U0 = _mm_hadd_epi16(S1, S2); _mm_storeu_si128((__m128i*)&tmp[col], U0); Val1 = _mm_add_epi16(U0, mAddOffset); Val1 = _mm_srai_epi16(Val1, 
shift); Val = _mm_packus_epi16(Val1, Val1); _mm_maskmoveu_si128(Val, mask8, (char *)&dst[col]); col += 8; } if (col < width - 3) { __m128i srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); T01 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T23 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T45 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T67 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); S1 = _mm_hadd_epi16(T01, T23); S2 = _mm_hadd_epi16(T45, T67); U0 = _mm_hadd_epi16(S1, S2); //_mm_store_si128((__m128i*)&tmp[col], U0); _mm_maskmoveu_si128(U0, maskx, (char *)&tmp[col]); Val1 = _mm_add_epi16(U0, mAddOffset); Val1 = _mm_srai_epi16(Val1, shift); Val = _mm_packus_epi16(Val1, Val1); _mm_maskmoveu_si128(Val, mask4, (char *)&dst[col]); } src += i_src; tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- * TODO: @luofl 20170827 intpl_luma_hor_sse128() дβֵ16 */ void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoef0 = _mm_loadl_epi64((__m128i*)coeff[0]); __m128i mCoef1 = _mm_loadl_epi64((__m128i*)coeff[1]); __m128i mCoef2 = _mm_loadl_epi64((__m128i*)coeff[2]); mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; __m128i mask8 = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 8) - 1])); __m128i mask4 = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 4) - 1])); __m128i maskx = _mm_loadu_si128((__m128i*)(intrinsic_mask[((width & 7) << 1) - 1])); mCoef0 = _mm_unpacklo_epi64(mCoef0, mCoef0); mCoef1 = _mm_unpacklo_epi64(mCoef1, mCoef1); mCoef2 = _mm_unpacklo_epi64(mCoef2, mCoef2); src -= 3; for (row = 0; row < height; row++) { __m128i TC1, TC2, TC3, TC4, TC5, TC6, TC7, TC8; __m128i T20, T40, T60, T80; __m128i sum1, sum2, val1, val2, val; __m128i srcCoeff1, srcCoeff2; for (col = 0; col < width - 15; col += 16) { srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); srcCoeff2 = _mm_loadu_si128((__m128i*)(src + col + 8)); TC1 = _mm_shuffle_epi8(srcCoeff1, mSwitch1); TC2 = _mm_shuffle_epi8(srcCoeff1, mSwitch2); TC3 = _mm_shuffle_epi8(srcCoeff1, mSwitch3); TC4 = _mm_shuffle_epi8(srcCoeff1, mSwitch4); TC5 = _mm_shuffle_epi8(srcCoeff2, mSwitch1); TC6 = _mm_shuffle_epi8(srcCoeff2, mSwitch2); TC7 = _mm_shuffle_epi8(srcCoeff2, mSwitch3); TC8 = _mm_shuffle_epi8(srcCoeff2, mSwitch4); // First T20 = _mm_maddubs_epi16(TC1, mCoef0); T40 = _mm_maddubs_epi16(TC2, mCoef0); T60 = _mm_maddubs_epi16(TC3, mCoef0); T80 = _mm_maddubs_epi16(TC4, mCoef0); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp0[col]), sum1); T20 = _mm_maddubs_epi16(TC5, mCoef0); T40 = _mm_maddubs_epi16(TC6, mCoef0); T60 = _mm_maddubs_epi16(TC7, mCoef0); T80 = _mm_maddubs_epi16(TC8, mCoef0); sum2 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp0[col + 8]), 
sum2); val1 = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val2 = _mm_srai_epi16(_mm_add_epi16(sum2, mAddOffset), shift); val = _mm_packus_epi16(val1, val2); _mm_storeu_si128((__m128i*)&dst0[col], val); // Second T20 = _mm_maddubs_epi16(TC1, mCoef1); T40 = _mm_maddubs_epi16(TC2, mCoef1); T60 = _mm_maddubs_epi16(TC3, mCoef1); T80 = _mm_maddubs_epi16(TC4, mCoef1); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp1[col]), sum1); T20 = _mm_maddubs_epi16(TC5, mCoef1); T40 = _mm_maddubs_epi16(TC6, mCoef1); T60 = _mm_maddubs_epi16(TC7, mCoef1); T80 = _mm_maddubs_epi16(TC8, mCoef1); sum2 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp1[col + 8]), sum2); val1 = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val2 = _mm_srai_epi16(_mm_add_epi16(sum2, mAddOffset), shift); val = _mm_packus_epi16(val1, val2); _mm_storeu_si128((__m128i*)&dst1[col], val); // Third T20 = _mm_maddubs_epi16(TC1, mCoef2); T40 = _mm_maddubs_epi16(TC2, mCoef2); T60 = _mm_maddubs_epi16(TC3, mCoef2); T80 = _mm_maddubs_epi16(TC4, mCoef2); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp2[col]), sum1); T20 = _mm_maddubs_epi16(TC5, mCoef2); T40 = _mm_maddubs_epi16(TC6, mCoef2); T60 = _mm_maddubs_epi16(TC7, mCoef2); T80 = _mm_maddubs_epi16(TC8, mCoef2); sum2 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)(&tmp2[col + 8]), sum2); val1 = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val2 = _mm_srai_epi16(_mm_add_epi16(sum2, mAddOffset), shift); val = _mm_packus_epi16(val1, val2); _mm_storeu_si128((__m128i*)&dst2[col], val); } if (col < width - 7) { srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); TC1 = _mm_shuffle_epi8(srcCoeff1, mSwitch1); TC2 = _mm_shuffle_epi8(srcCoeff1, mSwitch2); TC3 = _mm_shuffle_epi8(srcCoeff1, mSwitch3); TC4 = _mm_shuffle_epi8(srcCoeff1, mSwitch4); // First T20 = _mm_maddubs_epi16(TC1, mCoef0); T40 = _mm_maddubs_epi16(TC2, mCoef0); T60 = _mm_maddubs_epi16(TC3, mCoef0); T80 = _mm_maddubs_epi16(TC4, mCoef0); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)&tmp0[col], sum1); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask8, (char *)&dst0[col]); // Second T20 = _mm_maddubs_epi16(TC1, mCoef1); T40 = _mm_maddubs_epi16(TC2, mCoef1); T60 = _mm_maddubs_epi16(TC3, mCoef1); T80 = _mm_maddubs_epi16(TC4, mCoef1); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)&tmp1[col], sum1); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask8, (char *)&dst1[col]); // Third T20 = _mm_maddubs_epi16(TC1, mCoef2); T40 = _mm_maddubs_epi16(TC2, mCoef2); T60 = _mm_maddubs_epi16(TC3, mCoef2); T80 = _mm_maddubs_epi16(TC4, mCoef2); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_storeu_si128((__m128i*)&tmp2[col], sum1); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask8, (char *)&dst2[col]); col += 8; } if (col < (width - 3)) { srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); TC1 = _mm_shuffle_epi8(srcCoeff1, mSwitch1); TC2 = _mm_shuffle_epi8(srcCoeff1, mSwitch2); TC3 = _mm_shuffle_epi8(srcCoeff1, mSwitch3); TC4 = 
_mm_shuffle_epi8(srcCoeff1, mSwitch4); // First T20 = _mm_maddubs_epi16(TC1, mCoef0); T40 = _mm_maddubs_epi16(TC2, mCoef0); T60 = _mm_maddubs_epi16(TC3, mCoef0); T80 = _mm_maddubs_epi16(TC4, mCoef0); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_maskmoveu_si128(sum1, maskx, (char *)&tmp0[col]); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask4, (char *)&dst0[col]); // Second T20 = _mm_maddubs_epi16(TC1, mCoef1); T40 = _mm_maddubs_epi16(TC2, mCoef1); T60 = _mm_maddubs_epi16(TC3, mCoef1); T80 = _mm_maddubs_epi16(TC4, mCoef1); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_maskmoveu_si128(sum1, maskx, (char *)&tmp1[col]); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask4, (char *)&dst1[col]); // Third T20 = _mm_maddubs_epi16(TC1, mCoef2); T40 = _mm_maddubs_epi16(TC2, mCoef2); T60 = _mm_maddubs_epi16(TC3, mCoef2); T80 = _mm_maddubs_epi16(TC4, mCoef2); sum1 = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_maskmoveu_si128(sum1, maskx, (char *)&tmp2[col]); val = _mm_srai_epi16(_mm_add_epi16(sum1, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask4, (char *)&dst2[col]); } src += i_src; tmp0 += i_tmp; tmp1 += i_tmp; tmp2 += i_tmp; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* --------------------------------------------------------------------------- */ #define INTPL_LUMA_VER_SSE128_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm_maddubs_epi16(D0, W0); \ T1 = _mm_maddubs_epi16(D1, W1); \ T2 = _mm_maddubs_epi16(D2, W2); \ T3 = _mm_maddubs_epi16(D3, W3); \ T4 = _mm_maddubs_epi16(D4, W4); \ T5 = _mm_maddubs_epi16(D5, W5); \ T6 = _mm_maddubs_epi16(D6, W6); \ T7 = _mm_maddubs_epi16(D7, W7); \ \ mVal1 = _mm_add_epi16(T0, T1); \ mVal1 = _mm_add_epi16(mVal1, T2); \ mVal1 = _mm_add_epi16(mVal1, T3); \ \ mVal2 = _mm_add_epi16(T4, T5); \ mVal2 = _mm_add_epi16(mVal2, T6); \ mVal2 = _mm_add_epi16(mVal2, T7); \ \ mVal1 = _mm_add_epi16(mVal1, mAddOffset); \ mVal2 = _mm_add_epi16(mVal2, mAddOffset); \ mVal1 = _mm_srai_epi16(mVal1, shift); \ mVal2 = _mm_srai_epi16(mVal2, shift); \ result = _mm_packus_epi16(mVal1, mVal2); #define INTPL_LUMA_VER_SSE128_STORE(result, store_dst) \ _mm_storeu_si128((__m128i*)&(store_dst)[col], result); #define INTPL_LUMA_VER_SSE128_COMPUT_LO(W0,W1,W2,W3,result) \ T0 = _mm_maddubs_epi16(D0, W0); \ T1 = _mm_maddubs_epi16(D1, W1); \ T2 = _mm_maddubs_epi16(D2, W2); \ T3 = _mm_maddubs_epi16(D3, W3); \ \ mVal1 = _mm_add_epi16(T0, T1); \ mVal1 = _mm_add_epi16(mVal1, T2); \ mVal1 = _mm_add_epi16(mVal1, T3); \ \ mVal1 = _mm_add_epi16(mVal1, mAddOffset); \ mVal1 = _mm_srai_epi16(mVal1, shift); \ result = _mm_packus_epi16(mVal1, mVal1); void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; src -= 3 * i_src; int8_t coeff_tmp[2]; coeff_tmp[0] = coeff[7],coeff_tmp[1] = coeff[0]; __m128i coeff70 = _mm_set1_epi16(*(short*)coeff_tmp); __m128i coeff12 = _mm_set1_epi16(*(short*)(coeff + 1)); __m128i coeff34 = _mm_set1_epi16(*(short*)(coeff + 3)); __m128i coeff56 = _mm_set1_epi16(*(short*)(coeff + 5)); __m128i coeff01 = _mm_set1_epi16(*(short*)coeff); __m128i coeff23 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i coeff45 = 
_mm_set1_epi16(*(short*)(coeff + 4)); __m128i coeff67 = _mm_set1_epi16(*(short*)(coeff + 6)); __m128i mVal1, mVal2; __m128i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i D0, D1, D2, D3, D4, D5, D6, D7; __m128i U0, U1, U2, U3; for (row = 0; row < height; row = row + 4) { p = src; for (col = 0; col < width - 8; col += 16) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_src)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //0 D0 = _mm_unpacklo_epi8(T00, T10); D1 = _mm_unpacklo_epi8(T20, T30); D2 = _mm_unpacklo_epi8(T40, T50); D3 = _mm_unpacklo_epi8(T60, T70); D4 = _mm_unpackhi_epi8(T00, T10); D5 = _mm_unpackhi_epi8(T20, T30); D6 = _mm_unpackhi_epi8(T40, T50); D7 = _mm_unpackhi_epi8(T60, T70); INTPL_LUMA_VER_SSE128_COMPUT(coeff01, coeff23, coeff45, coeff67, coeff01, coeff23, coeff45, coeff67, U0); INTPL_LUMA_VER_SSE128_STORE(U0, dst); //1 D0 = _mm_unpacklo_epi8(T80, T10); D4 = _mm_unpackhi_epi8(T80, T10); INTPL_LUMA_VER_SSE128_COMPUT(coeff70, coeff12, coeff34, coeff56, coeff70, coeff12, coeff34, coeff56, U1); INTPL_LUMA_VER_SSE128_STORE(U1, dst + i_dst); //2 D0 = _mm_unpacklo_epi8(T80, T90); D4 = _mm_unpackhi_epi8(T80, T90); INTPL_LUMA_VER_SSE128_COMPUT(coeff67, coeff01, coeff23, coeff45, coeff67, coeff01, coeff23, coeff45, U2); INTPL_LUMA_VER_SSE128_STORE(U2, dst + 2 * i_dst); //3 D1 = _mm_unpacklo_epi8(Ta0, T30); D5 = _mm_unpackhi_epi8(Ta0, T30); INTPL_LUMA_VER_SSE128_COMPUT(coeff56, coeff70, coeff12, coeff34, coeff56, coeff70, coeff12, coeff34, U3); INTPL_LUMA_VER_SSE128_STORE(U3, dst + 3 * i_dst); p += 16; } //<=8bit if (col < width) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_src)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //0 D0 = _mm_unpacklo_epi8(T00, T10); D1 = _mm_unpacklo_epi8(T20, T30); D2 = _mm_unpacklo_epi8(T40, T50); D3 = _mm_unpacklo_epi8(T60, T70); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff01, coeff23, coeff45, coeff67, U0); INTPL_LUMA_VER_SSE128_STORE(U0, dst); //1 D0 = _mm_unpacklo_epi8(T80, T10); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff70, coeff12, coeff34, coeff56, U1); INTPL_LUMA_VER_SSE128_STORE(U1, dst + i_dst); //2 D0 = _mm_unpacklo_epi8(T80, T90); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff67, coeff01, coeff23, coeff45, U2); INTPL_LUMA_VER_SSE128_STORE(U2, dst + 2 * i_dst); //3 D1 = _mm_unpacklo_epi8(Ta0, T30); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff56, coeff70, coeff12, coeff34, U3); INTPL_LUMA_VER_SSE128_STORE(U3, dst + 3 * i_dst); p += 8; col += 8; } src += i_src * 4; dst += i_dst * 4; } } /* --------------------------------------------------------------------------- * */ void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int 
width, int height, int8_t const **coeff) { /* intpl_luma_ver_sse128(dst0, i_dst, src, i_src, width, height, coeff[0]); intpl_luma_ver_sse128(dst1, i_dst, src, i_src, width, height, coeff[1]); intpl_luma_ver_sse128(dst2, i_dst, src, i_src, width, height, coeff[2]); */ int row, col; const short offset = 32; const int shift = 6; int bsymFirst = (coeff[0][1] == coeff[0][6]); int bsymSecond = (coeff[1][1] == coeff[1][6]); int bsymThird = (coeff[2][1] == coeff[2][6]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; __m128i coeffFirst0, coeffFirst1, coeffFirst2, coeffFirst3; __m128i coeffSecond0, coeffSecond1, coeffSecond2, coeffSecond3; __m128i coeffThird0, coeffThird1, coeffThird2, coeffThird3; __m128i tempT00, tempT10, tempT20, tempT30; __m128i mVal; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; //load Coefficient if (bsymFirst) { coeffFirst0 = _mm_set1_epi8(coeff[0][0]); coeffFirst1 = _mm_set1_epi8(coeff[0][1]); coeffFirst2 = _mm_set1_epi8(coeff[0][2]); coeffFirst3 = _mm_set1_epi8(coeff[0][3]); } else { coeffFirst0 = _mm_set1_epi16(*(short*)coeff[0]); coeffFirst1 = _mm_set1_epi16(*(short*)(coeff[0] + 2)); coeffFirst2 = _mm_set1_epi16(*(short*)(coeff[0] + 4)); coeffFirst3 = _mm_set1_epi16(*(short*)(coeff[0] + 6)); } if (bsymSecond) { coeffSecond0 = _mm_set1_epi8(coeff[1][0]); coeffSecond1 = _mm_set1_epi8(coeff[1][1]); coeffSecond2 = _mm_set1_epi8(coeff[1][2]); coeffSecond3 = _mm_set1_epi8(coeff[1][3]); } else { coeffSecond0 = _mm_set1_epi16(*(short*)coeff[1]); coeffSecond1 = _mm_set1_epi16(*(short*)(coeff[1] + 2)); coeffSecond2 = _mm_set1_epi16(*(short*)(coeff[1] + 4)); coeffSecond3 = _mm_set1_epi16(*(short*)(coeff[1] + 6)); } if (bsymThird) { coeffThird0 = _mm_set1_epi8(coeff[2][0]); coeffThird1 = _mm_set1_epi8(coeff[2][1]); coeffThird2 = _mm_set1_epi8(coeff[2][2]); coeffThird3 = _mm_set1_epi8(coeff[2][3]); } else { coeffThird0 = _mm_set1_epi16(*(short*)coeff[2]); coeffThird1 = _mm_set1_epi16(*(short*)(coeff[2] + 2)); coeffThird2 = _mm_set1_epi16(*(short*)(coeff[2] + 4)); coeffThird3 = _mm_set1_epi16(*(short*)(coeff[2] + 6)); } //Double For for (row = 0; row < height; row++) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); //First if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); 
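/* Scalar reference for the 8-tap vertical filter evaluated above with the
 * first coefficient set (a sketch only; clip_pixel() is a hypothetical
 * clamp-to-[0,255] helper, not a function defined in this file):
 *
 *     for (j = 0; j < 8; j++) {
 *         int k, sum = 0;
 *         for (k = 0; k < 8; k++) {
 *             sum += p[k * i_src + j] * coeff[0][k];   // p points 3 rows above the output row
 *         }
 *         dst0[col + j] = clip_pixel((sum + 32) >> 6); // offset = 32, shift = 6
 *     }
 *
 * The packus above performs the unsigned saturation (clip); the store below
 * then writes the 8 finished pixels of the first destination plane. */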
_mm_storel_epi64((__m128i*)&dst0[col], mVal); //Second if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst1[col], mVal); //Third if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst2[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); //First if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst0[col]); //Second if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = 
_mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst1[col]); //Third if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst2[col]); } src += i_src; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { int row, col; int shift; int16_t const *p; int bsymy = (coeff[1] == coeff[6]); __m128i mAddOffset; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << (shift - 1)); tmp = tmp - 3 * i_tmp; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coeff[0]); __m128i mCoefy2 = _mm_set1_epi16(coeff[1]); __m128i mCoefy3 = _mm_set1_epi16(coeff[2]); __m128i mCoefy4 = _mm_set1_epi16(coeff[3]); __m128i mVal1, mVal2, mVal; for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); 
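/* Second (vertical) pass of the separable luma filter: each row of 'tmp'
 * holds the un-normalized 16-bit sums left by the horizontal pass, so the
 * taps are applied here with 32-bit accumulation and a single combined
 * rounding, offset 1 << 11 and shift 12 (6 bits for each of the two passes).
 * packs/packus then saturate the result down to 16 bits and finally to the
 * 8-bit pixels written by the store below. */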
_mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } tmp += i_tmp; dst += i_dst; } } else { __m128i mCoefy1 = _mm_set1_epi16(*(int16_t*)(coeff + 0)); __m128i mCoefy2 = _mm_set1_epi16(*(int16_t*)(coeff + 2)); __m128i mCoefy3 = _mm_set1_epi16(*(int16_t*)(coeff + 4)); __m128i mCoefy4 = _mm_set1_epi16(*(int16_t*)(coeff + 6)); __m128i mVal1, mVal2, mVal; mCoefy1 = _mm_cvtepi8_epi16(mCoefy1); mCoefy2 = _mm_cvtepi8_epi16(mCoefy2); mCoefy3 = _mm_cvtepi8_epi16(mCoefy3); mCoefy4 = _mm_cvtepi8_epi16(mCoefy4); for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = 
_mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } tmp += i_tmp; dst += i_dst; } } } void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { /* intpl_luma_ext_sse128(dst0, i_dst, tmp, i_tmp, width, height, coeff[0]); intpl_luma_ext_sse128(dst1, i_dst, tmp, i_tmp, width, height, coeff[1]); intpl_luma_ext_sse128(dst2, i_dst, tmp, i_tmp, width, height, coeff[2]); */ int row, col; int shift; int16_t const *p; int bsymyFirst = (coeff[0][1] == coeff[0][6]); int bsymySecond = (coeff[1][1] == coeff[1][6]); int bsymyThird = (coeff[2][1] == coeff[2][6]); __m128i mAddOffset; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << (shift - 1)); tmp = tmp - 3 * i_tmp; __m128i mCoefy1First,mCoefy2First,mCoefy3First,mCoefy4First; __m128i mCoefy1Second,mCoefy2Second,mCoefy3Second,mCoefy4Second; __m128i mCoefy1Third,mCoefy2Third,mCoefy3Third,mCoefy4Third; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; if(bsymyFirst) { mCoefy1First = _mm_set1_epi16(coeff[0][0]); mCoefy2First = _mm_set1_epi16(coeff[0][1]); mCoefy3First = _mm_set1_epi16(coeff[0][2]); mCoefy4First = _mm_set1_epi16(coeff[0][3]); } else { mCoefy1First = _mm_set1_epi16(*(int16_t*)coeff[0]); mCoefy2First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 2)); mCoefy3First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 4)); mCoefy4First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 6)); mCoefy1First = _mm_cvtepi8_epi16(mCoefy1First); mCoefy2First = _mm_cvtepi8_epi16(mCoefy2First); mCoefy3First = _mm_cvtepi8_epi16(mCoefy3First); mCoefy4First = _mm_cvtepi8_epi16(mCoefy4First); } if(bsymySecond) { mCoefy1Second = _mm_set1_epi16(coeff[1][0]); mCoefy2Second = _mm_set1_epi16(coeff[1][1]); 
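/* (Taps 2 and 3 of the second coefficient set follow.)  For a symmetric
 * filter, coeff[n][k] == coeff[n][7 - k], so only four taps are broadcast
 * and the unpack step in the loop below folds the row pairs (0,7), (1,6),
 * (2,5), (3,4) together; the non-symmetric path instead broadcasts
 * sign-extended tap pairs so that each _mm_madd_epi16 applies two taps per
 * multiply-add. */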
mCoefy3Second = _mm_set1_epi16(coeff[1][2]); mCoefy4Second = _mm_set1_epi16(coeff[1][3]); } else { mCoefy1Second = _mm_set1_epi16(*(int16_t*)coeff[1]); mCoefy2Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 2)); mCoefy3Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 4)); mCoefy4Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 6)); mCoefy1Second = _mm_cvtepi8_epi16(mCoefy1Second); mCoefy2Second = _mm_cvtepi8_epi16(mCoefy2Second); mCoefy3Second = _mm_cvtepi8_epi16(mCoefy3Second); mCoefy4Second = _mm_cvtepi8_epi16(mCoefy4Second); } if(bsymyThird) { mCoefy1Third = _mm_set1_epi16(coeff[2][0]); mCoefy2Third = _mm_set1_epi16(coeff[2][1]); mCoefy3Third = _mm_set1_epi16(coeff[2][2]); mCoefy4Third = _mm_set1_epi16(coeff[2][3]); } else { mCoefy1Third = _mm_set1_epi16(*(int16_t*)coeff[2]); mCoefy2Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 2)); mCoefy3Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 4)); mCoefy4Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 6)); mCoefy1Third = _mm_cvtepi8_epi16(mCoefy1Third); mCoefy2Third = _mm_cvtepi8_epi16(mCoefy2Third); mCoefy3Third = _mm_cvtepi8_epi16(mCoefy3Third); mCoefy4Third = _mm_cvtepi8_epi16(mCoefy4Third); } __m128i T00, T10, T20, T30, T40, T50, T60, T70; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i mVal1, mVal2, mVal; // for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); //First if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst0[col], mVal); //Second if (bsymySecond) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); 
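/* (The remaining low/high-half interleaves follow.)  In this non-symmetric
 * branch the eight 16-bit input rows are interleaved in adjacent pairs
 * (0,1), (2,3), (4,5), (6,7); each _mm_madd_epi16 below then multiplies a
 * row pair by its two taps and accumulates the products in 32-bit lanes. */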
T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst1[col], mVal); //Third if (bsymyThird) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst2[col], mVal); p += 8; } if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); //First if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); 
T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst0[col]); //Second if (bsymySecond) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst1[col]); //Third if (bsymyThird) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst2[col]); } tmp += i_tmp; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* 
--------------------------------------------------------------------------- */ void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col; const short offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= i_src; if (bsym) { __m128i coeff0 = _mm_set1_epi8(coeff[0]); __m128i coeff1 = _mm_set1_epi8(coeff[1]); __m128i mVal; for (row = 0; row < height; row++) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T30); T10 = _mm_unpacklo_epi8(T10, T20); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T30); T10 = _mm_unpacklo_epi8(T10, T20); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } else { __m128i coeff0 = _mm_set1_epi16(*(short*)coeff); __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i mVal; for (row = 0; row < height; row++) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T10); T10 = _mm_unpacklo_epi8(T20, T30); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T10); T10 = _mm_unpacklo_epi8(T20, T30); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const short 
offset = 32; const int shift = 6; int row, col; int bsym = (coeff[1] == coeff[6]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; if (bsym) { __m128i coeff0 = _mm_set1_epi8(coeff[0]); __m128i coeff1 = _mm_set1_epi8(coeff[1]); __m128i coeff2 = _mm_set1_epi8(coeff[2]); __m128i coeff3 = _mm_set1_epi8(coeff[3]); for (row = 0; row < height; row++) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T70), coeff0); T10 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T60), coeff1); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T00, T10), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T70), coeff0); T10 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T60), coeff1); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T00, T10), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } else { __m128i coeff0 = _mm_set1_epi16(*(short*)coeff); __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i coeff2 = _mm_set1_epi16(*(short*)(coeff + 4)); __m128i coeff3 = _mm_set1_epi16(*(short*)(coeff + 6)); for (row = 0; row < height; row++) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T10), coeff0); T10 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T00, T10), 
_mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T10), coeff0); T10 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T00, T10), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(32 + 3) * 32]); int16_t *tmp = tmp_res; const int i_tmp = 32; int row, col; int shift; int16_t const *p; int bsymy = (coef_y[1] == coef_y[6]); __m128i mAddOffset; __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m128i mSwitch2 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m128i mCoefx = _mm_set1_epi32(*(int*)coef_x); __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // HOR src = src - 1 * i_src - 1; if (width > 4) { for (row = -1; row < height + 2; row++) { __m128i mT0, mT1, mV01; for (col = 0; col < width; col += 8) { __m128i mSrc = _mm_loadu_si128((__m128i*)(src + col)); mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); mT1 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoefx); mV01 = _mm_hadd_epi16(mT0, mT1); _mm_store_si128((__m128i*)&tmp[col], mV01); } src += i_src; tmp += i_tmp; } } else { for (row = -1; row < height + 2; row++) { __m128i mSrc = _mm_loadu_si128((__m128i*)src); __m128i mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); __m128i mV01 = _mm_hadd_epi16(mT0, mT0); _mm_storel_epi64((__m128i*)tmp, mV01); src += i_src; tmp += i_tmp; } } // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << 11); tmp = tmp_res; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coef_y[0]); __m128i mCoefy2 = _mm_set1_epi16(coef_y[1]); for (row = 0; row < height; row += 2) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T3); __m128i M01 = _mm_unpacklo_epi16(T1, T2); __m128i M02 = _mm_unpackhi_epi16(T0, T3); __m128i M03 = _mm_unpackhi_epi16(T1, T2); __m128i M10 = _mm_unpacklo_epi16(T1, T4); __m128i M11 = _mm_unpacklo_epi16(T2, T3); __m128i M12 = _mm_unpackhi_epi16(T1, T4); __m128i M13 = 
_mm_unpackhi_epi16(T2, T3); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, mCoefy1), _mm_madd_epi16(M01, mCoefy2)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, mCoefy1), _mm_madd_epi16(M03, mCoefy2)); mV11 = _mm_add_epi32(_mm_madd_epi16(M10, mCoefy1), _mm_madd_epi16(M11, mCoefy2)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, mCoefy1), _mm_madd_epi16(M13, mCoefy2)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_storel_epi64((__m128i*)&dst[col], mV01); _mm_storel_epi64((__m128i*)&dst[col + i_dst], mV11); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst
__m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T3); __m128i M01 = _mm_unpacklo_epi16(T1, T2); __m128i M02 = _mm_unpackhi_epi16(T0, T3); __m128i M03 = _mm_unpackhi_epi16(T1, T2); __m128i M10 = _mm_unpacklo_epi16(T1, T4); __m128i M11 = _mm_unpacklo_epi16(T2, T3); __m128i M12 = _mm_unpackhi_epi16(T1, T4); __m128i M13 = _mm_unpackhi_epi16(T2, T3); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, mCoefy1), _mm_madd_epi16(M01, mCoefy2)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, mCoefy1), _mm_madd_epi16(M03, mCoefy2)); mV11 = _mm_add_epi32(_mm_madd_epi16(M10, mCoefy1), _mm_madd_epi16(M11, mCoefy2)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, mCoefy1), _mm_madd_epi16(M13, mCoefy2)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_maskmoveu_si128(mV01, mask, (char *)&dst[col]); _mm_maskmoveu_si128(mV11, mask, (char *)&dst[col + i_dst]); } tmp += i_tmp * 2; dst += i_dst * 2; } } else { __m128i coeff0 = _mm_set1_epi16(*(short*)coef_y); __m128i coeff1 = _mm_set1_epi16(*(short*)(coef_y + 2)); coeff0 = _mm_cvtepi8_epi16(coeff0); coeff1 = _mm_cvtepi8_epi16(coeff1); for (row = 0; row < height; row += 2) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T1); __m128i M01 = _mm_unpacklo_epi16(T2, T3); __m128i M02 = _mm_unpackhi_epi16(T0, T1); __m128i M03 = _mm_unpackhi_epi16(T2, T3); __m128i M10 = _mm_unpacklo_epi16(T1, T2); __m128i M11 = _mm_unpacklo_epi16(T3, T4); __m128i M12 = _mm_unpackhi_epi16(T1, T2); __m128i M13 = _mm_unpackhi_epi16(T3, T4); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, coeff0), _mm_madd_epi16(M01, coeff1)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, coeff0), _mm_madd_epi16(M03, coeff1));
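/* mV01/mV02 above hold the 32-bit sums for the current output row; mV11/mV12
 * below compute the next row from the same five rows loaded from 'tmp' (the
 * 4-tap chroma filter windows of rows r and r+1 overlap in three rows),
 * which is why this loop advances 'row' by 2. */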
mV11 = _mm_add_epi32(_mm_madd_epi16(M10, coeff0), _mm_madd_epi16(M11, coeff1)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, coeff0), _mm_madd_epi16(M13, coeff1)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_storel_epi64((__m128i*)&dst[col], mV01); _mm_storel_epi64((__m128i*)&dst[col + i_dst], mV11); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T1); __m128i M01 = _mm_unpacklo_epi16(T2, T3); __m128i M02 = _mm_unpackhi_epi16(T0, T1); __m128i M03 = _mm_unpackhi_epi16(T2, T3); __m128i M10 = _mm_unpacklo_epi16(T1, T2); __m128i M11 = _mm_unpacklo_epi16(T3, T4); __m128i M12 = _mm_unpackhi_epi16(T1, T2); __m128i M13 = _mm_unpackhi_epi16(T3, T4); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, coeff0), _mm_madd_epi16(M01, coeff1)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, coeff0), _mm_madd_epi16(M03, coeff1)); mV11 = _mm_add_epi32(_mm_madd_epi16(M10, coeff0), _mm_madd_epi16(M11, coeff1)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, coeff0), _mm_madd_epi16(M13, coeff1)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_maskmoveu_si128(mV01, mask, (char *)&dst[col]); _mm_maskmoveu_si128(mV11, mask, (char *)&dst[col + i_dst]); } tmp += i_tmp * 2; dst += i_dst * 2; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; const int i_tmp = 64; int row, col; int shift = 12; int16_t const *p; int bsymy = (coef_y[1] == coef_y[6]); __m128i mAddOffset = _mm_set1_epi32(1 << (shift - 1)); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoefx = _mm_loadl_epi64((__m128i*)coef_x); __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); mCoefx = _mm_unpacklo_epi64(mCoefx, mCoefx); // HOR src -= (3 * i_src + 3); for (row = -3; row < height + 4; row++) { for (col = 0; col < width; col += 8) { __m128i mSrc = _mm_loadu_si128((__m128i*)(src + col)); __m128i mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); __m128i mT1 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), 
mCoefx); __m128i mT2 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch3), mCoefx); __m128i mT3 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch4), mCoefx); __m128i mVal = _mm_hadd_epi16(_mm_hadd_epi16(mT0, mT1), _mm_hadd_epi16(mT2, mT3)); _mm_store_si128((__m128i*)&tmp[col], mVal); } src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coef_y[0]); __m128i mCoefy2 = _mm_set1_epi16(coef_y[1]); __m128i mCoefy3 = _mm_set1_epi16(coef_y[2]); __m128i mCoefy4 = _mm_set1_epi16(coef_y[3]); for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); 
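/* Tail of the row: fewer than 8 columns remain, so after packing, the result
 * is written with a byte-masked store. 'mask' was loaded from
 * intrinsic_mask[(width & 7) - 1] and selects only the (width & 7) valid
 * output bytes, leaving the remaining bytes of dst untouched. */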
mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } tmp += i_tmp; dst += i_dst; } } else { __m128i mCoefy1 = _mm_set1_epi16(*(int16_t*)coef_y); __m128i mCoefy2 = _mm_set1_epi16(*(int16_t*)(coef_y + 2)); __m128i mCoefy3 = _mm_set1_epi16(*(int16_t*)(coef_y + 4)); __m128i mCoefy4 = _mm_set1_epi16(*(int16_t*)(coef_y + 6)); mCoefy1 = _mm_cvtepi8_epi16(mCoefy1); mCoefy2 = _mm_cvtepi8_epi16(mCoefy2); mCoefy3 = _mm_cvtepi8_epi16(mCoefy3); mCoefy4 = _mm_cvtepi8_epi16(mCoefy4); for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = 
_mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } tmp += i_tmp; dst += i_dst; } } } xavs2-1.3/source/common/vec/intrinsic_inter_pred_avx2.c000066400000000000000000003442371340660520300233160ustar00rootroot00000000000000/* * intrinsic_inter_pred_avx2.c * * Description of this file: * AVX2 assembly functions of Inter-Prediction module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#include "../basic_types.h"
#include "intrinsic.h"
#pragma warning(disable:4127) // warning C4127: conditional expression is constant
/* --------------------------------------------------------------------------- */ void intpl_luma_block_hor_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col; const int offset = 32; const int shift = 6; const __m256i mAddOffset = _mm256_set1_epi16((short)offset); const __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0); const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); const __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); const __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); const __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef; src -= 3;
#if ARCH_X86_64
mCoef = _mm256_set1_epi64x(*(long long*)coeff);
#else
mCoef = _mm256_loadu_si256((__m256i*)coeff); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0);
#endif
for (row = 0; row < height; row++) { for (col = 0; col < width; col += 16) { __m256i S = _mm256_loadu_si256((__m256i*)(src + col)); __m256i S0 = _mm256_permute4x64_epi64(S, 0x94); __m256i T0, T1, T2, T3; __m256i sum; T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); T0 = _mm256_hadd_epi16(T0, T1); T1 = _mm256_hadd_epi16(T2, T3); sum = _mm256_hadd_epi16(T0, T1); sum = _mm256_srai_epi16(_mm256_add_epi16(sum,
mAddOffset), shift); sum = _mm256_packus_epi16(sum, sum); sum = _mm256_permute4x64_epi64(sum, 0xd8); _mm256_maskstore_epi32((int*)(dst + col), mask16, sum); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_hor_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; const __m256i mAddOffset = _mm256_set1_epi16((short)offset); const __m256i index = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); const __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); const __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); const __m256i mSwitch3 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); const __m256i mSwitch4 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); const __m256i mSwitch5 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); const __m256i mSwitch6 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef; UNUSED_PARAMETER(width); src -= 3; #if ARCH_X86_64 mCoef = _mm256_set1_epi64x(*(long long*)coeff); #else mCoef = _mm256_loadu_si256((__m256i*)coeff); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_permute4x64_epi64(S0, 0x99); __m256i T0, T1, T2, T3, T4, T5; __m256i sum1, sum2; T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); T4 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch5), mCoef); T5 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch6), mCoef); T0 = _mm256_hadd_epi16(T0, T1); sum1 = _mm256_hadd_epi16(_mm256_hadd_epi16(T2, T3), _mm256_hadd_epi16(T4, T5)); sum2 = _mm256_hadd_epi16(T0, T0); sum1 = _mm256_srai_epi16(_mm256_add_epi16(sum1, mAddOffset), shift); sum2 = _mm256_srai_epi16(_mm256_add_epi16(sum2, mAddOffset), shift); sum2 = _mm256_permutevar8x32_epi32(sum2, index); sum1 = _mm256_packus_epi16(sum1, sum2); _mm256_maskstore_epi32((int*)(dst), mask24, sum1); src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w32_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = 
_mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w64_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; 
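/* Scalar sketch of what one output pixel of this vertical 8-tap filter computes
 * (illustration only, not part of the build; clip_pixel() is a hypothetical helper
 * standing in for the unsigned 8-bit saturation done by _mm256_packus_epi16).
 * The taps cover source rows -3..+4 around the output row; the code below pre-shifts
 * src by -3 * i_src so tap k is read from src + k * i_src:
 *     sum = 0;
 *     for (k = 0; k < 8; k++) { sum += coeff[k] * src[k * i_src + x]; }
 *     dst[x] = clip_pixel((sum + 32) >> 6);
 * In the symmetric branch (coeff[k] == coeff[7 - k]) rows k and 7 - k are interleaved
 * so a single _mm256_maddubs_epi16 applies one coefficient to both rows at once. */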
int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); for 
(row = 0; row < height; row++) { __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; const int i_src8 = i_src * 8; __m256i mAddOffset = _mm256_set1_epi16((short)offset); src -= 3 * i_src; UNUSED_PARAMETER(width); if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = 
_mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); for (row = 0; row < height; row += 2) { __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); __m128i S5 = _mm_loadu_si128((__m128i*)(src + i_src5)); __m128i S6 = _mm_loadu_si128((__m128i*)(src + i_src6)); __m128i S7 = _mm_loadu_si128((__m128i*)(src + i_src7)); __m128i S8 = _mm_loadu_si128((__m128i*)(src + i_src8)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; __m256i R0, R1, R2, R3, R4, R5, R6, R7; R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); R4 = _mm256_set_m128i(S4, S5); R5 = _mm256_set_m128i(S5, S6); R6 = _mm256_set_m128i(S6, S7); R7 = _mm256_set_m128i(S7, S8); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R7), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R1, R6), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R1, R6), coeff1); T4 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R5), coeff2); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R3, R4), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R3, R4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T2), _mm256_add_epi16(T4, T6)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T1, T3), _mm256_add_epi16(T5, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)(coeff + 0)); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(int16_t*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(int16_t*)(coeff + 6)); for (row = 0; row < height; row += 2) { __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); __m128i S5 = _mm_loadu_si128((__m128i*)(src + i_src5)); __m128i S6 = _mm_loadu_si128((__m128i*)(src + i_src6)); __m128i S7 = _mm_loadu_si128((__m128i*)(src + i_src7)); __m128i S8 = _mm_loadu_si128((__m128i*)(src + i_src8)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; __m256i R0, R1, R2, R3, R4, R5, R6, R7; R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); R4 = _mm256_set_m128i(S4, S5); R5 = _mm256_set_m128i(S5, S6); R6 = _mm256_set_m128i(S6, S7); R7 = _mm256_set_m128i(S7, S8); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R1), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R3), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R3), coeff1); T4 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R4, R5), coeff2); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R4, R5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R6, R7), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R6, R7), 
coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T2), _mm256_add_epi16(T4, T6)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T1, T3), _mm256_add_epi16(T5, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); 
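/* general (non-symmetric) path: *(short*)coeff packs two adjacent taps into one 16-bit
 * constant, and the byte-interleaved row pairs let each _mm256_maddubs_epi16 apply both
 * taps of a pair in a single step */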
__m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w48_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const int shift = 6; const int offset = (1 << shift) >> 1; const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; const __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0); int bsym = (coeff[1] == coeff[6]); int row; src -= 3 * i_src; UNUSED_PARAMETER(width); if (bsym) { __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = 
_mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst + 32), mask16, mVal1); src += i_src; dst += i_dst; } } else { __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), 
coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst + 32), mask16, mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN32(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; const int i_tmp = 64; const int i_tmp2 = 2 * i_tmp; const int i_tmp3 = 3 * i_tmp; const int i_tmp4 = 4 * i_tmp; const int i_tmp5 = 5 * i_tmp; const int i_tmp6 = 6 * i_tmp; const int i_tmp7 = 7 * i_tmp; const int shift = 12; const __m256i mAddOffset = _mm256_set1_epi32((1 << shift) >> 1); int row, col; __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, // ǰ 8 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); // 8 __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef; src = src - 3 * i_src - 3; //HOR #if ARCH_X86_64 mCoef = _mm256_set1_epi64x(*(long long*)coef_x); #else mCoef = _mm256_loadu_si256((__m256i*)coef_x); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif for (row = -3; row < height + 4; row++) { for (col = 0; col < width; col += 16) { __m256i T0, T1, sum, T2, T3; __m256i S = _mm256_loadu_si256((__m256i*)(src + col)); // ǰ8ֵصͺ8ֵĵֱ뵽ǰ128λ __m256i S0 = _mm256_permute4x64_epi64(S, 0x94); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); sum = _mm256_hadd_epi16(_mm256_hadd_epi16(T0, T1), _mm256_hadd_epi16(T2, T3)); _mm256_store_si256((__m256i*)(tmp + col), sum); } src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y)); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); __m256i mCoefy3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 4))); __m256i mCoefy4 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 6))); // ͬʱֵ2/4Уظload for (row = 0; row < height; row++) { for (col = 0; col < width; col += 16) { __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i mVal1, mVal2; __m256i S0 = _mm256_load_si256((__m256i*)(tmp + col)); __m256i S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp)); __m256i S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2)); __m256i S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3)); __m256i S4 = _mm256_load_si256((__m256i*)(tmp + col 
+ i_tmp4)); __m256i S5 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp5)); __m256i S6 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp6)); __m256i S7 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp7)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy3); T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy4); T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy3); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal1 = _mm256_packs_epi32(mVal1, mVal2); mVal1 = _mm256_packus_epi16(mVal1, mVal1); mVal1 = _mm256_permute4x64_epi64(mVal1, 0xd8); _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal1)); } tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN32(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; const int i_tmp = 32; const int i_tmp2 = 2 * i_tmp; const int i_tmp3 = 3 * i_tmp; const int i_tmp4 = 4 * i_tmp; const int i_tmp5 = 5 * i_tmp; const int i_tmp6 = 6 * i_tmp; const int i_tmp7 = 7 * i_tmp; int row; int bsymy = (coef_y[1] == coef_y[6]); int shift = 12; __m256i mAddOffset = _mm256_set1_epi32(1 << 11); __m256i mCoef; __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); // HOR __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mSwitch3 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch4 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch5 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch6 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); src -= (3 * i_src + 3); #if ARCH_X86_64 mCoef = _mm256_set1_epi64x(*(long long*)coef_x); #else mCoef = _mm256_loadu_si256((__m256i*)coef_x); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif for (row = -3; row < height + 4; row++) { __m256i T0, T1, T2, T3, T4, T5, sum1, sum2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_permute4x64_epi64(S0, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch2), mCoef); T0 = _mm256_hadd_epi16(T0, T1); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); T4 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, 
mSwitch5), mCoef); T5 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch6), mCoef); sum1 = _mm256_hadd_epi16(_mm256_hadd_epi16(T2, T3), _mm256_hadd_epi16(T4, T5)); sum2 = _mm256_hadd_epi16(T0, T0); sum2 = _mm256_permute4x64_epi64(sum2, 0xd8); sum2 = _mm256_permute2x128_si256(sum1, sum2, 0x13); _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(sum1)); _mm256_storeu_si256((__m256i*)(tmp + 8), sum2); src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]); __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]); __m256i mCoefy3 = _mm256_set1_epi16(coef_y[2]); __m256i mCoefy4 = _mm256_set1_epi16(coef_y[3]); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal, mVal3, mVal4; __m256i T0, T1, T2, T3, S0, S1, S2, S3; __m256i T4, T5, T6, T7, S4, S5, S6, S7; __m256i T00, T11, T22, T33, S00, S11, S22, S33; __m256i T44, T55, T66, T77, S44, S55, S66, S77; S0 = _mm256_loadu_si256((__m256i*)(tmp)); S1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp4)); S5 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp5)); S6 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp6)); S7 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp7)); S00 = _mm256_loadu_si256((__m256i*)(tmp + 16)); S11 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp)); S22 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp2)); S33 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp3)); S44 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp4)); S55 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp5)); S66 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp6)); S77 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp7)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S7), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S6), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S5), mCoefy3); T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S3, S4), mCoefy4); T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S7), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S6), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S5), mCoefy3); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S3, S4), mCoefy4); T00 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S00, S77), mCoefy1); T11 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S11, S66), mCoefy2); T22 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S22, S55), mCoefy3); T33 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S33, S44), mCoefy4); T44 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S00, S77), mCoefy1); T55 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S11, S66), mCoefy2); T66 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S22, S55), mCoefy3); T77 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S33, S44), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal3 = _mm256_add_epi32(_mm256_add_epi32(T00, T11), _mm256_add_epi32(T22, T33)); mVal4 = _mm256_add_epi32(_mm256_add_epi32(T44, T55), _mm256_add_epi32(T66, T77)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(mVal3, mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(mVal4, mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 
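/* 0xd8 permutes the 64-bit lanes into the order 0,2,1,3, undoing the per-128-bit-lane interleaving left by the packs/packus above */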
0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } else { __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y))); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); __m256i mCoefy3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 4))); __m256i mCoefy4 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 6))); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal, mVal3, mVal4; __m256i T0, T1, T2, T3, S0, S1, S2, S3; __m256i T4, T5, T6, T7, S4, S5, S6, S7; __m256i T00, T11, T22, T33, S00, S11, S22, S33; __m256i T44, T55, T66, T77, S44, S55, S66, S77; S0 = _mm256_loadu_si256((__m256i*)(tmp)); S1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp4)); S5 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp5)); S6 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp6)); S7 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp7)); S00 = _mm256_loadu_si256((__m256i*)(tmp + 16)); S11 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp)); S22 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp2)); S33 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp3)); S44 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp4)); S55 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp5)); S66 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp6)); S77 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp7)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy3); T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy4); T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy3); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy4); T00 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S00, S11), mCoefy1); T11 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S22, S33), mCoefy2); T22 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S44, S55), mCoefy3); T33 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S66, S77), mCoefy4); T44 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S00, S11), mCoefy1); T55 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S22, S33), mCoefy2); T66 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S44, S55), mCoefy3); T77 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S66, S77), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal3 = _mm256_add_epi32(_mm256_add_epi32(T00, T11), _mm256_add_epi32(T22, T33)); mVal4 = _mm256_add_epi32(_mm256_add_epi32(T44, T55), _mm256_add_epi32(T66, T77)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(mVal3, mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(mVal4, mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, 
int i_src, int width, int height, const int8_t *coeff) { int row, col; const int offset = 32; const int shift = 6; __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coeff); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0); src -= 1; for (row = 0; row < height; row++) { for (col = 0; col < width; col += 16) { __m256i T0, T1, sum; __m256i S = _mm256_loadu_si256((__m256i*)(src + col)); __m256i S0 = _mm256_permute4x64_epi64(S, 0x94); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef); sum = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T0, T1), mAddOffset), shift); sum = _mm256_packus_epi16(sum, sum); sum = _mm256_permute4x64_epi64(sum, 0xd8); _mm256_maskstore_epi32((int*)(dst + col), mask16, sum); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { const int offset = 32; const int shift = 6; const __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coeff); const __m256i mSwitch = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); const __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); const __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const __m256i mAddOffset = _mm256_set1_epi16((short)offset); const __m256i index = _mm256_setr_epi32(0, 1, 2, 6, 4, 5, 3, 7); int row; src -= 1; for (row = 0; row < height; row++) { __m256i T0, T1, T2, sum1, sum2; __m256i S = _mm256_loadu_si256((__m256i*)(src)); __m256i S0 = _mm256_permute4x64_epi64(S, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef); sum1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T0, T1), mAddOffset), shift); sum2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T2, T2), mAddOffset), shift); sum1 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(sum1, sum2), index); _mm256_maskstore_epi32((int*)(dst), mask24, sum1); src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_w32_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m256i mAddOffset = _mm256_set1_epi16((short)offset); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; src -= i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); for (row = 0; row < height; row++) { __m256i S0, S1, S2, S3; __m256i T0, T1, T2, T3, mVal1, mVal2; S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 
i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S3), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S2), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S3), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S2), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; src -= i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S3), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S2), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S3), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S2), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src 
+ i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m256i mAddOffset = _mm256_set1_epi16((short)offset); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; src -= i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); for (row = 0; row < height; row = row + 2) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i R0, R1, R2, R3; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R3), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R3), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R1, R2), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R1, R2), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T2), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T1, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); for (row = 0; row < height; row = row + 2) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i R0, R1, R2, R3; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R1), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R3), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R3), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T2), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T1, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } } /* --------------------------------------------------------------------------- */ void 
intpl_chroma_block_ext_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN32(int16_t tmp_res[(32 + 3) * 32]); int16_t *tmp = tmp_res; const int i_tmp = 32; const int i_tmp2 = 2 * i_tmp; const int i_tmp3 = 3 * i_tmp; const int shift = 12; int row, col; int bsymy = (coef_y[1] == coef_y[6]); __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1)); __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coef_x); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); // HOR src -= (i_src + 1); for (row = -1; row < height + 2; row++) { for (col = 0; col < width; col += 16) { __m256i T0, T1, S, S0, sum; S = _mm256_loadu_si256((__m256i*)(src + col)); S0 = _mm256_permute4x64_epi64(S, 0x94); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef); sum = _mm256_hadd_epi16(T0, T1); _mm256_storeu_si256((__m256i*)(tmp + col), sum); } src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]); __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]); for (row = 0; row < height; row++) { for (col = 0; col < width; col += 16) { __m256i mVal1, mVal2, mVal; __m256i T0, T1, T2, T3, S0, S1, S2, S3; S0 = _mm256_load_si256((__m256i*)(tmp + col)); S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S3), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S2), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S3), mCoefy1); T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S2), mCoefy2); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), /*no-use*/mVal1); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal)); } tmp += i_tmp; dst += i_dst; } } else { __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y)); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); for (row = 0; row < height; row++) { for (col = 0; col < width; col += 16) { __m256i mVal1, mVal2, mVal; __m256i T0, T1, T2, T3, S0, S1, S2, S3; S0 = _mm256_load_si256((__m256i*)(tmp + col)); S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1); T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), /*no-use*/mVal1); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal)); } tmp += i_tmp; 
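/* For reference, a scalar sketch of what this separable 4-tap chroma filter computes
 * (illustration only; clip_pixel() is a hypothetical stand-in for the packs/packus
 * saturation). The horizontal pass keeps unrounded 16-bit sums in tmp, and all of the
 * rounding happens here in the vertical pass with a single 12-bit shift:
 *     tmp[r][x] = coef_x[0]*src[r][x-1] + coef_x[1]*src[r][x] + coef_x[2]*src[r][x+1] + coef_x[3]*src[r][x+2];
 *     dst[r][x] = clip_pixel((coef_y[0]*tmp[r-1][x] + coef_y[1]*tmp[r][x] + coef_y[2]*tmp[r+1][x] + coef_y[3]*tmp[r+2][x] + (1 << 11)) >> 12);
 */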
dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ext_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN32(int16_t tmp_res[(32 + 3) * 32]); int16_t *tmp = tmp_res; const int i_tmp = 32; const int i_tmp2 = 2 * i_tmp; const int i_tmp3 = 3 * i_tmp; int row; int bsymy = (coef_y[1] == coef_y[6]); const int shift = 12; __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1)); __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coef_x); __m256i mSwitch = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); //HOR src = src - i_src - 1; UNUSED_PARAMETER(width); for (row = -1; row < height + 2; row++) { __m256i T0, T1, T2, S, S0; S = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef); T0 = _mm256_hadd_epi16(T0, T1); T2 = _mm256_hadd_epi16(T2, T2); T2 = _mm256_permute4x64_epi64(T2, 0xd8); T2 = _mm256_permute2x128_si256(T0, T2, 0x13); _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(T0)); _mm256_storeu_si256((__m256i*)(tmp + 8), T2); src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]); __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal3, mVal4, mVal; __m256i S0, S1, S2, S3, S4, S5, S6, S7; __m256i T0, T1, T2, T3, T4, T5, T6, T7; S0 = _mm256_load_si256((__m256i*)(tmp)); S1 = _mm256_load_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_load_si256((__m256i*)(tmp + 16)); S5 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp)); S6 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp2)); S7 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp3)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S3), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S2), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S3), mCoefy1); T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S2), mCoefy2); T4 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S7), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S5, S6), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S7), mCoefy1); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S5, S6), mCoefy2); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T4, T5), mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T6, T7), mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst 
+= i_dst; } } else { __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y)); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal3, mVal4, mVal; __m256i S0, S1, S2, S3, S4, S5, S6, S7; __m256i T0, T1, T2, T3, T4, T5, T6, T7; S0 = _mm256_load_si256((__m256i*)(tmp)); S1 = _mm256_load_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_load_si256((__m256i*)(tmp + 16)); S5 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp)); S6 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp2)); S7 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp3)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1); T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2); T4 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy1); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy2); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T4, T5), mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T6, T7), mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } } /*--------------------------------------- ֵ ------------------------------------------------------*/ /* --------------------------------------------------------------------------- */ void intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 4 - 1) { case 3: case 7: case 11: case 15: intpl_luma_block_hor_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 5: intpl_luma_block_hor_w24_avx2(dst, i_dst, src, i_src, width, height, coeff); break; default: intpl_luma_block_hor_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 4 - 1) { case 3: intpl_luma_block_ver_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 5: intpl_luma_block_ver_w24_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 7: intpl_luma_block_ver_w32_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 11: intpl_luma_block_ver_w48_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 15: intpl_luma_block_ver_w64_avx2(dst, i_dst, src, i_src, width, height, coeff); break; default: intpl_luma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { switch (width / 4 - 1) { case 3: case 7: case 11: case 15: 
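/* Width dispatch (worked mapping of the switch key, derived from the code):
 *     width / 4 - 1 :  16 -> 3,  24 -> 5,  32 -> 7,  48 -> 11,  64 -> 15
 * so widths that are multiples of 16 fall through to the 16-column AVX2
 * kernel, width 24 gets its dedicated kernel, and all remaining widths
 * (4, 8, 12, ...) take the SSE128 fallback in the default branch.
 */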
intpl_luma_block_ext_w16_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; case 5: intpl_luma_block_ext_w24_avx2(dst, i_dst, src, i_src, height, coef_x, coef_y); break; default: intpl_luma_block_ext_sse128(dst, i_dst, src, i_src, width, height, coef_x, coef_y); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 2 - 1) { case 7: case 15: intpl_chroma_block_hor_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 11: intpl_chroma_block_hor_w24_avx2(dst, i_dst, src, i_src, height, coeff); break; default: intpl_chroma_block_hor_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 2 - 1) { case 7: intpl_chroma_block_ver_w16_avx2(dst, i_dst, src, i_src, height, coeff); break; case 11: intpl_chroma_block_ver_w24_avx2(dst, i_dst, src, i_src, height, coeff); break; case 15: intpl_chroma_block_ver_w32_avx2(dst, i_dst, src, i_src, height, coeff); break; default: intpl_chroma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { switch (width / 2 - 1) { case 7: case 15: intpl_chroma_block_ext_w16_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; case 11: intpl_chroma_block_ext_w24_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; default: intpl_chroma_block_ext_sse128(dst, i_dst, src, i_src, width, height, coef_x, coef_y); } } /* --------------------------------------------------------------------------- */ #define INTPL_LUMA_EXT_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W0, W1), mCoefy01); \ T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W2, W3), mCoefy23); \ T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W4, W5), mCoefy45); \ T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W6, W7), mCoefy67); \ T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W0, W1), mCoefy01); \ T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W2, W3), mCoefy23); \ T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W4, W5), mCoefy45); \ T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W6, W7), mCoefy67); \ \ mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); \ mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); \ \ mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); \ mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); \ result = _mm256_packs_epi32(mVal1, mVal2); #define INTPL_LUMA_EXT_STORE(a, b, c) \ mVal = _mm256_permute4x64_epi64(_mm256_packus_epi16(a, b), 216); \ _mm256_storeu_si256((__m256i*)(c), mVal); /* --------------------------------------------------------------------------- */ void intpl_luma_ext_avx2(pel_t *dst, int i_dst, int16_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { const int shift = 12; int row, col; int16_t const *p; __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1)); __m256i mCoefy01 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 0))); __m256i mCoefy23 = 
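/* What this vertical pass computes, as a scalar sketch (illustrative only; the
 * helper name clip_uint8 is not part of xavs2). tmp holds the un-normalized
 * 16-bit output of the horizontal pass with stride i_tmp and, as the
 * "tmp -= 3 * i_tmp" adjustment below implies, three valid rows above and four
 * below each output row:
 *
 *     int s = 0;
 *     for (k = 0; k < 8; k++)
 *         s += coeff[k] * tmp[(y + k - 3) * i_tmp + x];
 *     dst[y * i_dst + x] = clip_uint8((s + (1 << 11)) >> 12);
 *
 * The AVX2 code below produces four such rows and 32 columns per iteration and
 * performs the clipping with _mm256_packs_epi32 and _mm256_packus_epi16.
 */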
_mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 2))); __m256i mCoefy45 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 4))); __m256i mCoefy67 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 6))); tmp -= 3 * i_tmp; for (row = 0; row < height; row = row + 4) { __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i U0, U1, U2, U3; __m256i V0, V1, V2, V3; __m256i mVal1, mVal2, mVal; p = tmp; for (col = 0; col < width - 31; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; } if (col < width - 16) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); 
T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; col += 32; } if (col < width) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); INTPL_LUMA_EXT_STORE(U0, U0, dst + col); INTPL_LUMA_EXT_STORE(U1, U1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, U2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, U3, dst + 3 * i_dst + col); p += 16; col += 16; } tmp += i_tmp * 4; dst += i_dst * 4; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ext_x3_avx2(pel_t *const dst[3], int i_dst, int16_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { intpl_luma_ext_avx2(dst[0], i_dst, tmp, i_tmp, width, height, coeff[0]); intpl_luma_ext_avx2(dst[1], i_dst, tmp, i_tmp, width, height, coeff[1]); intpl_luma_ext_avx2(dst[2], i_dst, tmp, i_tmp, width, height, coeff[2]); } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_avx2(pel_t *dst, int i_dst, int16_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m256i mAddOffset = _mm256_set1_epi16(offset); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mask4 = _mm256_setr_epi16(-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m256i mask8 = _mm256_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m256i mask16 = _mm256_setr_epi16(-1, -1, 
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); #if ARCH_X86_64 __m256i mCoef = _mm256_set1_epi64x(*(int64_t*)coeff); #else __m256i mCoef = _mm256_loadu_si256((__m256i*)coeff); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif src -= 3; for (row = 0; row < height; row++) { __m256i srcCoeff1, srcCoeff2; __m256i T20, T40, T60, T80; __m256i sum10, sum20; for (col = 0; col < width - 31; col += 32) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch4), mCoef); sum20 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 32bit _mm256_storeu_si256((__m256i*)&tmp[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 16bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mAddOffset), shift); _mm256_storeu_si256((__m256i*)&dst[col], _mm256_packus_epi16(sum10, sum20)); } // width 16 if (col < width - 15) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); srcCoeff1 = _mm256_permute2x128_si256(srcCoeff1, srcCoeff2, 32); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 32bit _mm256_storeu_si256((__m256i*)&tmp[col], sum10); // store 16bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8); _mm256_maskstore_epi32((int*)&dst[col], mask16, sum10); col += 16; } // width 8 if (col < width - 7) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 16bit _mm256_maskstore_epi32((int*)&tmp[col], mask16, sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); _mm256_maskstore_epi32((int*)&dst[col], mask8, sum10); col += 8; } if (col < width - 3) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = 
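/* Scalar sketch of this horizontal pass (illustrative only; clip_uint8 is just
 * a placeholder name). For every pixel the same 8-tap sum is written twice:
 * once at full 16-bit precision into tmp, for the vertical pass done
 * elsewhere, and once rounded down to 8 bits into dst:
 *
 *     int s = 0;
 *     for (k = 0; k < 8; k++)
 *         s += coeff[k] * src[col + k - 3];   // src as passed in, before "src -= 3"
 *     tmp[col] = (int16_t)s;                  // full-precision plane
 *     dst[col] = clip_uint8((s + 32) >> 6);   // offset = 32, shift = 6
 */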
_mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 8bit _mm256_maskstore_epi32((int*)&tmp[col], mask8, sum10); // store 4bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); _mm256_maskstore_epi32((int*)&dst[col], mask4, sum10); } src += i_src; tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m256i mOffset = _mm256_set1_epi16(offset); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef0, mCoef1, mCoef2; mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; __m256i mask4 = _mm256_setr_epi16(-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m256i mask8 = _mm256_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m256i mask16 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); #if ARCH_X86_64 mCoef0 = _mm256_set1_epi64x(*(int64_t*)coeff[0]); mCoef1 = _mm256_set1_epi64x(*(int64_t*)coeff[1]); mCoef2 = _mm256_set1_epi64x(*(int64_t*)coeff[2]); #else mCoef0 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[0]), 0x0); mCoef1 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[1]), 0x0); mCoef2 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[2]), 0x0); #endif src -= 3; for (row = 0; row < height; row++) { __m256i srcCoeff1, srcCoeff2; __m256i S11, S12, S13, S14; __m256i S21, S22, S23, S24; __m256i sum10, sum20; for (col = 0; col < width - 31; col += 32) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1); S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2); S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3); S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4); S21 = _mm256_shuffle_epi8(srcCoeff2, mSwitch1); S22 = _mm256_shuffle_epi8(srcCoeff2, mSwitch2); S23 = _mm256_shuffle_epi8(srcCoeff2, mSwitch3); S24 = _mm256_shuffle_epi8(srcCoeff2, mSwitch4); #define INTPL_HOR_FLT(Coef, S1, S2, S3, S4, Res) do { \ __m256i T0 = _mm256_maddubs_epi16(S1, Coef); \ __m256i T1 = _mm256_maddubs_epi16(S2, Coef); \ __m256i T2 = _mm256_maddubs_epi16(S3, Coef); \ __m256i T3 = _mm256_maddubs_epi16(S4, Coef); \ Res = _mm256_hadd_epi16(_mm256_hadd_epi16(T0, T1), _mm256_hadd_epi16(T2, T3)); \ } while (0) /* 1st */ INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10); INTPL_HOR_FLT(mCoef0, S21, S22, S23, S24, sum20); // store 16bit 
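/* The _x3 variant runs three horizontal filters (coeff[0..2], presumably the
 * three fractional horizontal positions needed by motion compensation) over
 * the same source in one pass: the loads and the four byte shuffles
 * S11..S14 and S21..S24 are computed once and reused for each coefficient
 * set, and every set writes both a 16-bit plane and a rounded 8-bit plane,
 * exactly as in the single-filter function above:
 *
 *     for each 32 input pixels:
 *         load + shuffle once               (S11..S14, S21..S24)
 *         for f in {0, 1, 2}:
 *             sum_f = INTPL_HOR_FLT(mCoef_f, S...)
 *             tmp_f[col] = sum_f                      (16-bit)
 *             dst_f[col] = (sum_f + 32) >> 6          (8-bit, clipped)
 */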
_mm256_storeu_si256((__m256i*)&tmp0[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp0[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift); _mm256_storeu_si256((__m256i*)&dst0[col], _mm256_packus_epi16(sum10, sum20)); /* 2nd */ INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10); INTPL_HOR_FLT(mCoef1, S21, S22, S23, S24, sum20); // store 16bit _mm256_storeu_si256((__m256i*)&tmp1[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp1[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift); _mm256_storeu_si256((__m256i*)&dst1[col], _mm256_packus_epi16(sum10, sum20)); /* 3rd */ INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10); INTPL_HOR_FLT(mCoef2, S21, S22, S23, S24, sum20); // store 16bit _mm256_storeu_si256((__m256i*)&tmp2[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp2[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift); _mm256_storeu_si256((__m256i*)&dst2[col], _mm256_packus_epi16(sum10, sum20)); } // width 16 if (col < width - 15) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); srcCoeff1 = _mm256_permute2x128_si256(srcCoeff1, srcCoeff2, 32); S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1); S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2); S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3); S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4); /* 1st */ INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10); // store 16bit _mm256_storeu_si256((__m256i*)&tmp0[col], sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8); //_mm256_storeu_si256((__m256i*)&dst0[col], sum10); _mm256_maskstore_epi32((int*)&dst0[col], mask16, sum10); /* 2nd */ INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10); // store 16bit _mm256_storeu_si256((__m256i*)&tmp1[col], sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8); //_mm256_storeu_si256((__m256i*)&dst1[col], sum10); _mm256_maskstore_epi32((int*)&dst1[col], mask16, sum10); /* 3rd */ INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10); // store 16bit _mm256_storeu_si256((__m256i*)&tmp2[col], sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8); //_mm256_storeu_si256((__m256i*)&dst2[col], sum10); _mm256_maskstore_epi32((int*)&dst2[col], mask16, sum10); col += 16; } // width 8 if (col < width - 7) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1); S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2); S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3); S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4); /* 1st */ INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10); // store 16bit //_mm256_storeu_si256((__m256i*)&tmp0[col], sum10); _mm256_maskstore_epi32((int*)&tmp0[col], mask16, sum10); // store 8bit sum10 = 
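/* The maskN constants select the low N bytes of a _mm256_maskstore_epi32
 * (each pair of -1 epi16 values forms one all-ones dword, and maskstore keeps
 * a dword only when its mask dword has the sign bit set):
 *     mask16 -> 16 bytes   (16 output pixels, or 8 int16 sums)
 *     mask8  ->  8 bytes   ( 8 output pixels, or 4 int16 sums)
 *     mask4  ->  4 bytes   ( 4 output pixels)
 * In this 8-wide tail, for example, the 16-bit sums go out under mask16
 * (8 values = 16 bytes) and the packed 8-bit pixels under mask8 (8 bytes).
 */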
_mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst0[col], sum10); _mm256_maskstore_epi32((int*)&dst0[col], mask8, sum10); /* 2nd */ INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10); // store 16bit //_mm256_storeu_si256((__m256i*)&tmp1[col], sum10); _mm256_maskstore_epi32((int*)&tmp1[col], mask16, sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst1[col], sum10); _mm256_maskstore_epi32((int*)&dst1[col], mask8, sum10); /* 3rd */ INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10); // store 16bit //_mm256_storeu_si256((__m256i*)&tmp2[col], sum10); _mm256_maskstore_epi32((int*)&tmp2[col], mask16, sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst2[col], sum10); _mm256_maskstore_epi32((int*)&dst2[col], mask8, sum10); col += 8; } // width 4 if (col < width - 3) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1); S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2); S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3); S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4); /* 1st */ INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10); // store 8bit //_mm256_storeu_si256((__m256i*)&tmp0[col], sum10); _mm256_maskstore_epi32((int*)&tmp0[col], mask8, sum10); // store 4bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst0[col], sum10); _mm256_maskstore_epi32((int*)&dst0[col], mask4, sum10); /* 2nd */ INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10); // store 16bit //_mm256_storeu_si256((__m256i*)&tmp1[col], sum10); _mm256_maskstore_epi32((int*)&tmp1[col], mask8, sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst1[col], sum10); _mm256_maskstore_epi32((int*)&dst1[col], mask4, sum10); /* 3rd */ INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10); // store 16bit //_mm256_storeu_si256((__m256i*)&tmp2[col], sum10); _mm256_maskstore_epi32((int*)&tmp2[col], mask8, sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); //_mm256_storeu_si256((__m256i*)&dst2[col], sum10); _mm256_maskstore_epi32((int*)&dst2[col], mask4, sum10); } src += i_src; tmp0 += i_tmp; tmp1 += i_tmp; tmp2 += i_tmp; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } #undef INTPL_HOR_FLT } /* --------------------------------------------------------------------------- */ #define INTPL_LUMA_VER_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm256_maddubs_epi16(D0, W0); \ T1 = _mm256_maddubs_epi16(D1, W1); \ T2 = _mm256_maddubs_epi16(D2, W2); \ T3 = _mm256_maddubs_epi16(D3, W3); \ T4 = _mm256_maddubs_epi16(D4, W4); \ T5 = _mm256_maddubs_epi16(D5, W5); \ T6 = _mm256_maddubs_epi16(D6, W6); \ T7 = _mm256_maddubs_epi16(D7, W7); \ \ mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); \ mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); \ \ mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); \ mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); \ result = _mm256_packus_epi16(mVal1, mVal2); #define INTPL_LUMA_VER_STORE(a, b) \ 
_mm256_storeu_si256((__m256i*)(b), a); #define INTPL_LUMA_VER_COMPUT_LOW(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm256_maddubs_epi16(D0, W0); \ T1 = _mm256_maddubs_epi16(D1, W1); \ T2 = _mm256_maddubs_epi16(D2, W2); \ T3 = _mm256_maddubs_epi16(D3, W3); \ \ mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); \ \ mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); \ result = _mm256_packus_epi16(mVal1, mVal1); /* --------------------------------------------------------------------------- */ void intpl_luma_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col; const short offset = 32; const int shift = 6; __m256i mAddOffset = _mm256_set1_epi16(offset); pel_t const *p; src -= 3 * i_src; __m256i mVal1, mVal2; int8_t coeff_tmp[2]; coeff_tmp[0] = coeff[7], coeff_tmp[1] = coeff[0]; __m256i mCoefy70 = _mm256_set1_epi16(*(short*)coeff_tmp); __m256i mCoefy12 = _mm256_set1_epi16(*(short*)(coeff + 1)); __m256i mCoefy34 = _mm256_set1_epi16(*(short*)(coeff + 3)); __m256i mCoefy56 = _mm256_set1_epi16(*(short*)(coeff + 5)); __m256i mCoefy01 = _mm256_set1_epi16(*(short*)coeff); __m256i mCoefy23 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i mCoefy45 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i mCoefy67 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i D0, D1, D2, D3, D4, D5, D6, D7; __m256i U0, U1, U2, U3; for (row = 0; row < height; row = row + 4) { p = src; for (col = 0; col < width; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_src)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_src)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_src)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_src)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_src)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_src)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_src)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_src)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_src)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_src)); D0 = _mm256_unpacklo_epi8(T00, T10); D1 = _mm256_unpacklo_epi8(T20, T30); D2 = _mm256_unpacklo_epi8(T40, T50); D3 = _mm256_unpacklo_epi8(T60, T70); D4 = _mm256_unpackhi_epi8(T00, T10); D5 = _mm256_unpackhi_epi8(T20, T30); D6 = _mm256_unpackhi_epi8(T40, T50); D7 = _mm256_unpackhi_epi8(T60, T70); INTPL_LUMA_VER_COMPUT(mCoefy01, mCoefy23, mCoefy45, mCoefy67, mCoefy01, mCoefy23, mCoefy45, mCoefy67, U0); INTPL_LUMA_VER_STORE(U0, dst + col); D0 = _mm256_unpacklo_epi8(T80, T10); D4 = _mm256_unpackhi_epi8(T80, T10); INTPL_LUMA_VER_COMPUT(mCoefy70, mCoefy12, mCoefy34, mCoefy56, mCoefy70, mCoefy12, mCoefy34, mCoefy56, U1); INTPL_LUMA_VER_STORE(U1, dst + i_dst + col); D0 = _mm256_unpacklo_epi8(T80, T90); D4 = _mm256_unpackhi_epi8(T80, T90); INTPL_LUMA_VER_COMPUT(mCoefy67, mCoefy01, mCoefy23, mCoefy45, mCoefy67, mCoefy01, mCoefy23, mCoefy45, U2); INTPL_LUMA_VER_STORE(U2, dst + 2 * i_dst + col); D1 = _mm256_unpacklo_epi8(Ta0, T30); D5 = _mm256_unpackhi_epi8(Ta0, T30); INTPL_LUMA_VER_COMPUT(mCoefy56, mCoefy70, mCoefy12, mCoefy34, mCoefy56, mCoefy70, mCoefy12, mCoefy34, U3); INTPL_LUMA_VER_STORE(U3, dst + 3 * i_dst + col); p += 32; } src += 4 * i_src; dst += 4 * i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ver_x3_avx2(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int 
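/* How intpl_luma_ver_avx2 above reuses its unpacked rows (worked identity,
 * derived from the code): for the first output row the byte pairs
 * (r0,r1)(r2,r3)(r4,r5)(r6,r7) are multiplied by the coefficient pairs
 * (c0,c1)(c2,c3)(c4,c5)(c6,c7) via _mm256_maddubs_epi16. For the next row only
 * the first pair is rebuilt, as (r8,r1), and the coefficient pairs are rotated
 * to (c7,c0)(c1,c2)(c3,c4)(c5,c6), so the accumulated sum
 *
 *     c7*r8 + c0*r1 + c1*r2 + c2*r3 + c3*r4 + c4*r5 + c5*r6 + c6*r7
 *
 * is exactly the 8-tap filter over rows r1..r8, i.e. the result for row y+1,
 * at the cost of one new unpack instead of four. Rows y+2 and y+3 follow the
 * same rotation (mCoefy70, mCoefy12, ... are those rotated coefficient pairs).
 */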
height, const int8_t **coeff) { intpl_luma_ver_avx2(dst[0], i_dst, src, i_src, width, height, coeff[0]); intpl_luma_ver_avx2(dst[1], i_dst, src, i_src, width, height, coeff[1]); intpl_luma_ver_avx2(dst[2], i_dst, src, i_src, width, height, coeff[2]); } xavs2-1.3/source/common/vec/intrinsic_intra-filledge.c000066400000000000000000000452431340660520300231100ustar00rootroot00000000000000/* * intrinsic_intra-fiiledge.c * * Description of this file: * SSE assembly functions of Intra-Filledge module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../avs2_defs.h" #include "../basic_types.h" #include "intrinsic.h" #include #include #include #include /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * LCU߽ϵPU */ void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; int num_padding; UNUSED_PARAMETER(pTL); UNUSED_PARAMETER(i_TL); /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; T0 = _mm_set1_epi8((uint8_t)g_dc_value); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); EP[2 * bsx] = (pel_t)g_dc_value; /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ if (bsx == 4) { memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pLcuEP[1]); _mm_storel_epi64((__m128i *)&EP[1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(pLcuEP + i + 1)); _mm_store_si128((__m128i *)(&EP[1] + i), T1); } } } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { if (bsx == 4) { memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pLcuEP[bsx + 1]); _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(&pLcuEP[bsx + i + 1])); _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1); } } } else { if (bsx == 4) { memset(&EP[bsx + 1], EP[bsx], bsx); } else if (bsx == 8) { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0); } else { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel for (i = 0; i < bsx; i += 16) { _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0); } } } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); } else { if (bsy == 4) { memset(&EP[-(bsy << 1)], EP[-bsy], bsy); } else if (bsy == 8) { T0 = _mm_set1_epi8(EP[-bsy]); _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0); } else { T0 = _mm_set1_epi8(EP[-bsy]); for (i = 0; i < bsy; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } } } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } /* fill EP[0] */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pLcuEP[1]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pLcuEP[-1]; } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * LCU߽ϵPU */ void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; int num_padding; const pel_t *pL = pTL + i_TL; /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; T0 = _mm_set1_epi8((uint8_t)g_dc_value); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); EP[2 * bsx] = (pel_t)g_dc_value; /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ if (bsx == 4) { memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pLcuEP[1]); _mm_storel_epi64((__m128i *)&EP[1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(pLcuEP + i + 1)); _mm_store_si128((__m128i *)(&EP[1] + i), T1); } } } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { if (bsx == 4) { memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pLcuEP[bsx + 1]); _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(&pLcuEP[bsx + i + 1])); _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1); } } } else { if (bsx == 4) { memset(&EP[bsx + 1], EP[bsx], bsx); } else if (bsx == 8) { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0); } else { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel for (i = 0; i < bsx; i += 16) { _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0); } } } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { const pel_t *p_l = pL + bsy * i_TL; int y; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { if (bsy == 4) { memset(&EP[-(bsy << 1)], EP[-bsy], bsy); } else if (bsy == 8) { T0 = _mm_set1_epi8(EP[-bsy]); _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0); } else { T0 = _mm_set1_epi8(EP[-bsy]); for (i = 0; i < bsy; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } } } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } /* fill EP[0] */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pLcuEP[1]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * LCU߽ϵPU */ void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; int num_padding; const pel_t *pT = pTL + 1; UNUSED_PARAMETER(i_TL); /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; T0 = _mm_set1_epi8((uint8_t)g_dc_value); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); EP[2 * bsx] = (pel_t)g_dc_value; /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ if (bsx == 4) { memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)pT); _mm_storel_epi64((__m128i *)&EP[1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(pT + i)); _mm_store_si128((__m128i *)(&EP[1] + i), T1); } } } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { if (bsx == 4) { memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pT[bsx]); _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(&pT[bsx + i])); _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1); } } } else { if (bsx == 4) { memset(&EP[bsx + 1], EP[bsx], bsx); } else if (bsx == 8) { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0); } else { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel for (i = 0; i < bsx; i += 16) { _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0); } } } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { /* fill left pixels */ memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t)); } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t)); } else { if (bsy == 4) { memset(&EP[-(bsy << 1)], EP[-bsy], bsy); } else if (bsy == 8) { T0 = _mm_set1_epi8(EP[-bsy]); _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0); } else { T0 = _mm_set1_epi8(EP[-bsy]); for (i = 0; i < bsy; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } } } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } /* fill EP[0] */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pLcuEP[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pLcuEP[-1]; } } /* --------------------------------------------------------------------------- * fill reference samples for intra prediction * LCU߽ϵPU */ void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy) { __m128i T0, T1; int i, k, j; int num_padding; const pel_t *pT = pTL + 1; const pel_t *pL = pTL + i_TL; UNUSED_PARAMETER(pLcuEP); /* fill default value */ k = ((bsy + bsx) << 1) + 1; j = (k >> 4) << 4; T0 = _mm_set1_epi8((uint8_t)g_dc_value); for (i = 0; i < j; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1); EP[2 * bsx] = (pel_t)g_dc_value; /* get prediction pixels --------------------------------------- * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ if (bsx == 4) { memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)pT); _mm_storel_epi64((__m128i *)&EP[1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(pT + i)); _mm_store_si128((__m128i *)(&EP[1] + i), T1); } } } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { if (bsx == 4) { memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pT[bsx]); _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(&pT[bsx + i])); _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1); } } } else { if (bsx == 4) { memset(&EP[bsx + 1], EP[bsx], bsx); } else if (bsx == 8) { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0); } else { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel for (i = 0; i < bsx; i += 16) { _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0); } } } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { const pel_t *p_l = pL + bsy * i_TL; int y; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { if (bsy == 4) { memset(&EP[-(bsy << 1)], EP[-bsy], bsy); } else if (bsy == 8) { T0 = _mm_set1_epi8(EP[-bsy]); _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0); } else { T0 = _mm_set1_epi8(EP[-bsy]); for (i = 0; i < bsy; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } } } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } /* fill EP[0] */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pTL[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } } xavs2-1.3/source/common/vec/intrinsic_intra-pred.c000066400000000000000000011216201340660520300222620ustar00rootroot00000000000000/* * intrinsic_intra-pred.c * * Description of this file: * SSE assembly functions of Intra-Prediction module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../avs2_defs.h" #include "../basic_types.h" #include "intrinsic.h" #include #include #include #include static ALIGN16(int8_t tab_coeff_mode_5[8][16]) = { { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0 } }; static uint8_t tab_idx_mode_5[64] = { 1, 2, 4, 5, 6, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 26, 27, 28, 30, 31, 33, 34, 35, 37, 38, 39, 41, 42, 44, 45, 46, 48, 49, 50, 52, 53, 55, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 72, 74, 75, 77, 78, 79, 81, 82, 83, 85, 86, 88 }; /* --------------------------------------------------------------------------- */ void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int y; pel_t *rpSrc = src + 1; __m128i T1, T2, T3, T4; UNUSED_PARAMETER(dir_mode); switch (bsx) { case 4: for (y = 0; y < bsy; y += 2) { CP32(dst, rpSrc); CP32(dst + i_dst, rpSrc); dst += i_dst << 1; } break; case 8: for (y = 0; y < bsy; y += 2) { CP64(dst, rpSrc); CP64(dst + i_dst, rpSrc); dst += i_dst << 1; } break; case 16: T1 = _mm_loadu_si128((__m128i*)rpSrc); for (y = 0; y < bsy; y++) { _mm_storeu_si128((__m128i*)(dst), T1); dst += i_dst; } break; case 32: T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0)); T2 = _mm_loadu_si128((__m128i*)(rpSrc + 16)); for (y = 0; y < bsy; y++) { _mm_storeu_si128((__m128i*)(dst + 0), T1); _mm_storeu_si128((__m128i*)(dst + 16), T2); dst += i_dst; } break; case 64: T1 = _mm_loadu_si128((__m128i*)(rpSrc + 0)); T2 = _mm_loadu_si128((__m128i*)(rpSrc + 16)); T3 = _mm_loadu_si128((__m128i*)(rpSrc + 32)); T4 = _mm_loadu_si128((__m128i*)(rpSrc + 48)); for (y = 0; y < bsy; y++) { _mm_storeu_si128((__m128i*)(dst + 0), T1); _mm_storeu_si128((__m128i*)(dst + 16), T2); _mm_storeu_si128((__m128i*)(dst + 32), T3); _mm_storeu_si128((__m128i*)(dst + 48), T4); dst += i_dst; } break; default: assert(0); break; } } /* --------------------------------------------------------------------------- */ void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int y; pel_t *rpSrc = src - 1; __m128i T; UNUSED_PARAMETER(dir_mode); switch (bsx) { case 4: for (y = 0; y < bsy; y++) { M32(dst) = 0x01010101 * rpSrc[-y]; dst += i_dst; } break; case 8: for (y = 0; y < bsy; y++) { M64(dst) = 0x0101010101010101 * rpSrc[-y]; dst += i_dst; } break; case 16: for (y = 0; y < bsy; y++) { T = _mm_set1_epi8((char)rpSrc[-y]); _mm_storeu_si128((__m128i*)(dst), T); dst += i_dst; } break; case 32: for (y = 0; y < bsy; y++) { T = _mm_set1_epi8((char)rpSrc[-y]); _mm_storeu_si128((__m128i*)(dst + 0), T); _mm_storeu_si128((__m128i*)(dst + 16), T); dst += i_dst; } break; case 64: for (y = 0; y < bsy; y++) { T = _mm_set1_epi8((char)rpSrc[-y]); 
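/* Horizontal intra prediction simply replicates the left-neighbour sample of
 * each row. For the narrow cases above the replication is done with an
 * integer multiply, e.g. if rpSrc[-y] == 0x5A then
 *     0x5A * 0x01010101         == 0x5A5A5A5A          (4 copies, bsx == 4)
 *     0x5A * 0x0101010101010101 == 0x5A5A5A5A5A5A5A5A  (8 copies, bsx == 8)
 * while the wider cases use _mm_set1_epi8, as in this branch.
 */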
_mm_storeu_si128((__m128i*)(dst + 0), T); _mm_storeu_si128((__m128i*)(dst + 16), T); _mm_storeu_si128((__m128i*)(dst + 32), T); _mm_storeu_si128((__m128i*)(dst + 48), T); dst += i_dst; } break; default: assert(0); break; } } /* --------------------------------------------------------------------------- */ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int avail_above = dir_mode >> 8; int avail_left = dir_mode & 0xFF; int dc_value; int sum_above = 0; int sum_left = 0; int x, y; pel_t *p_src; __m128i zero = _mm_setzero_si128(); __m128i S0; __m128i p00, p10, p20, p30; /* sum of left samples */ // for (y = 0; y < bsy; y++) dc_value += p_src[-y]; p_src = src - bsy; if (bsy == 4) { sum_left += p_src[0] + p_src[1] + p_src[2] + p_src[3]; } else if (bsy == 8) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_srli_si128(p00, 8); p00 = _mm_add_epi16(p00, p10); sum_left += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } else { p30 = zero; for (y = 0; y < bsy - 8; y += 16, p_src += 16) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_unpackhi_epi8(S0, zero); p20 = _mm_add_epi16(p00, p10); p30 = _mm_add_epi16(p30, p20); } p00 = _mm_srli_si128(p30, 8); p00 = _mm_add_epi16(p30, p00); sum_left += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } /* sum of above samples */ //for (x = 0; x < bsx; x++) dc_value += p_src[x]; p_src = src + 1; if (bsx == 4) { sum_above += p_src[0] + p_src[1] + p_src[2] + p_src[3]; } else if (bsx == 8) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_srli_si128(p00, 8); p00 = _mm_add_epi16(p00, p10); sum_above += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } else { p30 = zero; for (x = 0; x < bsx - 8; x += 16, p_src += 16) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_unpackhi_epi8(S0, zero); p20 = _mm_add_epi16(p00, p10); p30 = _mm_add_epi16(p30, p20); } p00 = _mm_srli_si128(p30, 8); p00 = _mm_add_epi16(p30, p00); sum_above += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } if (avail_left && avail_above) { x = bsx + bsy; dc_value = ((sum_above + sum_left + (x >> 1)) * (512 / x)) >> 9; } else if (avail_left) { dc_value = (sum_left + (bsy >> 1)) >> xavs2_log2u(bsy); } else if (avail_above) { dc_value = (sum_above + (bsx >> 1)) >> xavs2_log2u(bsx); } else { dc_value = g_dc_value; } p00 = _mm_set1_epi8((pel_t)dc_value); for (y = 0; y < bsy; y++) { if (bsx == 8) { _mm_storel_epi64((__m128i*)dst, p00); } else if (bsx == 4) { *(int*)(dst) = _mm_cvtsi128_si32(p00); } else { for (x = 0; x < bsx - 8; x += 16) { _mm_storeu_si128((__m128i*)(dst + x), p00); } } dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *rpSrc; int iH = 0; int iV = 0; int iA, iB, iC; int x, y; int iW2 = bsx >> 1; int iH2 = bsy >> 1; int ib_mult[5] = { 13, 17, 5, 11, 23 }; int ib_shift[5] = { 7, 10, 11, 15, 19 }; int im_h = ib_mult[tab_log2[bsx] - 2]; int is_h = ib_shift[tab_log2[bsx] - 2]; int im_v = ib_mult[tab_log2[bsy] - 2]; int is_v = ib_shift[tab_log2[bsy] - 2]; int iTmp; UNUSED_PARAMETER(dir_mode); rpSrc = src + iW2; for (x = 1; x < iW2 + 1; x++) { iH += x * (rpSrc[x] - rpSrc[-x]); } rpSrc = src - iH2; for (y = 1; y < iH2 + 1; y++) { iV += y * 
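/* Plane prediction in closed form, as implemented below (derived from the
 * code): iH and iV are weighted sums of differences along the top row and the
 * left column (the horizontal and vertical gradients),
 * iA = 16 * (bottom of the left column + right end of the top row), and
 * iB, iC rescale the gradients with the per-size multiplier and shift tables.
 * Each predicted sample is then
 *
 *     pred(x, y) = clip((iA + (x - (iW2 - 1)) * iB
 *                           + (y - (iH2 - 1)) * iC + 16) >> 5)
 *
 * which the SIMD code evaluates incrementally: iTmp is the x == 0, y == 0
 * value, TB adds iB per column and TC adds iC per row, with the final >> 5
 * and the clip done by _mm_srai_epi16 and _mm_packus_epi16.
 */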
(rpSrc[-y] - rpSrc[y]); } iA = (src[-1 - (bsy - 1)] + src[1 + bsx - 1]) << 4; iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h; iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v; iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16; __m128i TC, TB, TA, T_Start, T, D, D1; TA = _mm_set1_epi16((int16_t)iTmp); TB = _mm_set1_epi16((int16_t)iB); TC = _mm_set1_epi16((int16_t)iC); T_Start = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); T_Start = _mm_mullo_epi16(TB, T_Start); T_Start = _mm_add_epi16(T_Start, TA); TB = _mm_mullo_epi16(TB, _mm_set1_epi16(8)); if (bsx == 4) { for (y = 0; y < bsy; y++) { D = _mm_srai_epi16(T_Start, 5); D = _mm_packus_epi16(D, D); // extract low 32 bits from the packed result , and put it into a integer . (Redundant operation?) _mm_stream_si32((int *)dst, _mm_extract_epi32(D, 0)); T_Start = _mm_add_epi16(T_Start, TC); dst += i_dst; } } else if (bsx == 8) { for (y = 0; y < bsy; y++) { D = _mm_srai_epi16(T_Start, 5); D = _mm_packus_epi16(D, D); _mm_storel_epi64((__m128i*)dst, D); T_Start = _mm_add_epi16(T_Start, TC); dst += i_dst; } } else { for (y = 0; y < bsy; y++) { T = T_Start; for (x = 0; x < bsx; x += 16) { D = _mm_srai_epi16(T, 5); T = _mm_add_epi16(T, TB); D1 = _mm_srai_epi16(T, 5); T = _mm_add_epi16(T, TB); D = _mm_packus_epi16(D, D1); _mm_storeu_si128((__m128i*)(dst + x), D); } T_Start = _mm_add_epi16(T_Start, TC); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int x, y; int ishift_x = tab_log2[bsx]; int ishift_y = tab_log2[bsy]; int ishift = XAVS2_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); int a, b, c, w, val; pel_t *p; __m128i T, T1, T2, T3, C1, C2, ADD; __m128i ZERO = _mm_setzero_si128(); ALIGN32(itr_t pTop [MAX_CU_SIZE + 32]); ALIGN32(itr_t pLeft[MAX_CU_SIZE + 32]); ALIGN32(itr_t pT [MAX_CU_SIZE + 32]); ALIGN32(itr_t pL [MAX_CU_SIZE + 32]); ALIGN32(itr_t wy [MAX_CU_SIZE + 32]); UNUSED_PARAMETER(dir_mode); p = src + 1; for (x = 0; x < bsx; x += 16) { T = _mm_loadu_si128((__m128i*)(p + x)); T1 = _mm_unpacklo_epi8(T, ZERO); T2 = _mm_unpackhi_epi8(T, ZERO); _mm_store_si128((__m128i*)(pTop + x), T1); _mm_store_si128((__m128i*)(pTop + x + 8), T2); } for (y = 0; y < bsy; y++) { pLeft[y] = src[-1 - y]; } a = pTop[bsx - 1]; b = pLeft[bsy - 1]; if (bsx == bsy) { c = (a + b + 1) >> 1; } else { c = (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6); } w = (c << 1) - a - b; T = _mm_set1_epi16((int16_t)b); for (x = 0; x < bsx; x += 8) { T1 = _mm_load_si128((__m128i*)(pTop + x)); T2 = _mm_sub_epi16(T, T1); T1 = _mm_slli_epi16(T1, ishift_y); _mm_store_si128((__m128i*)(pT + x), T2); _mm_store_si128((__m128i*)(pTop + x), T1); } T = _mm_set1_epi16((int16_t)a); for (y = 0; y < bsy; y += 8) { T1 = _mm_load_si128((__m128i*)(pLeft + y)); T2 = _mm_sub_epi16(T, T1); T1 = _mm_slli_epi16(T1, ishift_x); _mm_store_si128((__m128i*)(pL + y), T2); _mm_store_si128((__m128i*)(pLeft + y), T1); } T = _mm_set1_epi16((int16_t)w); T = _mm_mullo_epi16(T, _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0)); T1 = _mm_set1_epi16((int16_t)(8 * w)); for (y = 0; y < bsy; y += 8) { _mm_store_si128((__m128i*)(wy + y), T); T = _mm_add_epi16(T, T1); } C1 = _mm_set_epi32(3, 2, 1, 0); C2 = _mm_set1_epi32(4); if (bsx == 4) { __m128i pTT = _mm_loadl_epi64((__m128i*)pT); T = _mm_loadl_epi64((__m128i*)pTop); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = 
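/* Bilinear prediction corner values (worked example, derived from the code):
 * a is the last top-row reference, b the last left-column reference, and for
 * square blocks c = (a + b + 1) >> 1 with w = 2*c - a - b as a corner
 * correction, e.g. a = 100, b = 60 gives c = 80 and w = 0. The per-step
 * increments pT[x] = b - pTop[x] and pL[y] = a - pLeft[y] blend the top row
 * towards b when moving down and the left column towards a when moving right,
 * and the running sums in this loop accumulate exactly those increments
 * before the final >> ishift_xy.
 */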
_mm_set1_epi32(add); ADD = _mm_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); T = _mm_add_epi16(T, pTT); T1 = _mm_cvtepi16_epi32(T); T1 = _mm_slli_epi32(T1, ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); T1 = _mm_packus_epi32(T1, T1); T1 = _mm_packus_epi16(T1, T1); M32(dst) = _mm_cvtsi128_si32(T1); dst += i_dst; } } else if (bsx == 8) { __m128i pTT = _mm_load_si128((__m128i*)pT); T = _mm_load_si128((__m128i*)pTop); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm_set1_epi32(add); T3 = _mm_mullo_epi32(C2, ADD); ADD = _mm_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); T = _mm_add_epi16(T, pTT); T1 = _mm_cvtepi16_epi32(T); T2 = _mm_cvtepi16_epi32(_mm_srli_si128(T, 8)); T1 = _mm_slli_epi32(T1, ishift_x); T2 = _mm_slli_epi32(T2, ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T2 = _mm_add_epi32(T2, ADD); T2 = _mm_srai_epi32(T2, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T1 = _mm_packus_epi32(T1, T2); T1 = _mm_packus_epi16(T1, T1); _mm_storel_epi64((__m128i*)dst, T1); dst += i_dst; } } else { __m128i TT[16]; __m128i PTT[16]; for (x = 0; x < bsx; x += 8) { int idx = x >> 2; __m128i M0 = _mm_load_si128((__m128i*)(pTop + x)); __m128i M1 = _mm_load_si128((__m128i*)(pT + x)); TT[idx] = _mm_unpacklo_epi16(M0, ZERO); TT[idx + 1] = _mm_unpackhi_epi16(M0, ZERO); PTT[idx] = _mm_cvtepi16_epi32(M1); PTT[idx + 1] = _mm_cvtepi16_epi32(_mm_srli_si128(M1, 8)); } for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm_set1_epi32(add); T3 = _mm_mullo_epi32(C2, ADD); ADD = _mm_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); for (x = 0; x < bsx; x += 8) { int idx = x >> 2; TT[idx] = _mm_add_epi32(TT[idx], PTT[idx]); TT[idx + 1] = _mm_add_epi32(TT[idx + 1], PTT[idx + 1]); T1 = _mm_slli_epi32(TT[idx], ishift_x); T2 = _mm_slli_epi32(TT[idx + 1], ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T2 = _mm_add_epi32(T2, ADD); T2 = _mm_srai_epi32(T2, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T1 = _mm_packus_epi32(T1, T2); T1 = _mm_packus_epi16(T1, T1); _mm_storel_epi64((__m128i*)(dst + x), T1); } dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if ((bsy > 4) && (bsx > 8)) { ALIGN16(pel_t first_line[(64 + 176 + 16) << 2]); int line_size = bsx + (((bsy - 4) * 11) >> 2); int aligned_line_size = 64 + 176 + 16; int i; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = 
_mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i H2 = L10; __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); __m128i H3 = L11; SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); __m128i H4 = L12; SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); __m128i H5 = L13; SS11 = _mm_srli_si128(SS11, 1); __m128i H6 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H7 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H8 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H9 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H10 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H11 = _mm_unpacklo_epi8(SS11, zero); __m128i SS20 = _mm_loadu_si128((__m128i*)(src + 20)); __m128i H12 = _mm_unpacklo_epi8(SS20, zero); SS20 = _mm_srli_si128(SS20, 1); __m128i H13 = _mm_unpacklo_epi8(SS20, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_add_epi16(H2, coeff8); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H11, H13); p11 = _mm_mullo_epi16(H12, coeff2); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = 
_mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[3][i], p00); } if (i < line_size) { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); } bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i H2 = L10; __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); __m128i H3 = L11; SS11 = 
_mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); __m128i H4 = L12; SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); __m128i H5 = L13; SS11 = _mm_srli_si128(SS11, 1); __m128i H6 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H7 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H8 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H9 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H10 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H11 = _mm_unpacklo_epi8(SS11, zero); __m128i SS20 = _mm_loadu_si128((__m128i*)(src + 20)); __m128i H12 = _mm_unpacklo_epi8(SS20, zero); SS20 = _mm_srli_si128(SS20, 1); __m128i H13 = _mm_unpacklo_epi8(SS20, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_add_epi16(H2, coeff8); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H11, H13); p11 = _mm_mullo_epi16(H12, coeff2); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else if (bsx == 8) { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 
= _mm_unpacklo_epi8(SS2, zero); __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst3, p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst4, p00); __m128i pad1 = _mm_set1_epi8(src[16]); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); dst1[0] = (pel_t)((src[13] + 5 * src[14] + 7 * src[15] + 3 * src[16] + 8) >> 4); dst1[1] = (pel_t)((src[14] + 5 * src[15] + 7 * src[16] + 3 * src[17] + 8) >> 4); dst1[2] = (pel_t)((src[15] + 5 * src[16] + 7 * src[17] + 3 * src[18] + 8) >> 4); if (bsy == 32) { for (int i = 0; i < 6; i++) { dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); } } } else { if (bsy == 16) { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); __m128i pad1 = _mm_set1_epi8(src[8]); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); 
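/* beyond the first two predicted rows the reference samples are exhausted for this narrow block, so this and all remaining rows are filled with the replicated boundary sample src[8] */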
*((int*)(dst4)) = _mm_cvtsi128_si32(pad1); for (int i = 0; i < 3; i++) { dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; *((int*)(dst1)) = _mm_cvtsi128_si32(pad1); *((int*)(dst2)) = _mm_cvtsi128_si32(pad1); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); *((int*)(dst4)) = _mm_cvtsi128_si32(pad1); } } else { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); __m128i pad1 = _mm_set1_epi8(src[8]); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); *((int*)(dst4)) = _mm_cvtsi128_si32(pad1); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; __m128i zero = _mm_setzero_si128(); __m128i offset = _mm_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 3; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, offset); sum3 = _mm_add_epi16(sum3, offset); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, offset); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } if (bsx == bsy || bsx > 16) { for (i = 0; i < iHeight2; i 
+= 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst = dst1 + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (((bsy > 4) && (bsx > 8))) { ALIGN16(pel_t first_line[(64 + 80 + 16) << 3]); int line_size = bsx + ((bsy - 8) >> 3) * 11; int aligned_line_size = (((line_size + 15) >> 4) << 4) + 16; pel_t *pfirst[8]; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = 
_mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L9 = _mm_unpacklo_epi8(SS1, zero); __m128i H1 = L9; __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10)); __m128i L10 = _mm_unpacklo_epi8(SS10, zero); __m128i H2 = L10; SS10 = _mm_srli_si128(SS10, 1); __m128i L11 = _mm_unpacklo_epi8(SS10, zero); __m128i H3 = L11; SS10 = _mm_srli_si128(SS10, 1); __m128i L12 = _mm_unpacklo_epi8(SS10, zero); __m128i H4 = L12; SS10 = _mm_srli_si128(SS10, 1); __m128i L13 = _mm_unpacklo_epi8(SS10, zero); __m128i H5 = L13; SS10 = _mm_srli_si128(SS10, 1); __m128i H6 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H7 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H8 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H9 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H10 = _mm_unpacklo_epi8(SS10, zero); __m128i SS19 = _mm_loadu_si128((__m128i*)(src + 19)); __m128i H11 = _mm_unpacklo_epi8(SS19, zero); SS19 = _mm_srli_si128(SS19, 1); __m128i H12 = _mm_unpacklo_epi8(SS19, zero); SS19 = _mm_srli_si128(SS19, 1); __m128i H13 = _mm_unpacklo_epi8(SS19, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H1, coeff5); p11 = _mm_mullo_epi16(H2, coeff13); p21 = _mm_mullo_epi16(H3, coeff11); p31 = _mm_mullo_epi16(H4, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(H2, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H4, coeff7); p11 = _mm_mullo_epi16(H5, coeff15); p21 = _mm_mullo_epi16(H6, coeff9); p31 = _mm_add_epi16(H7, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = 
_mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[3][i], p00); p00 = _mm_add_epi16(L6, coeff16); p10 = _mm_mullo_epi16(L7, coeff9); p20 = _mm_mullo_epi16(L8, coeff15); p30 = _mm_mullo_epi16(L9, coeff7); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_add_epi16(H6, coeff16); p11 = _mm_mullo_epi16(H7, coeff9); p21 = _mm_mullo_epi16(H8, coeff15); p31 = _mm_mullo_epi16(H9, coeff7); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[4][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[5][i], p00); p00 = _mm_mullo_epi16(L9, coeff3); p10 = _mm_mullo_epi16(L10, coeff11); p20 = _mm_mullo_epi16(L11, coeff13); p30 = _mm_mullo_epi16(L12, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H9, coeff3); p11 = _mm_mullo_epi16(H10, coeff11); p21 = _mm_mullo_epi16(H11, coeff13); p31 = _mm_mullo_epi16(H12, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[6][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_add_epi16(L12, L12); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H11, H13); p11 = _mm_add_epi16(H12, H12); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[7][i], p00); } if (i < line_size) { __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L9 = _mm_unpacklo_epi8(SS1, zero); __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10)); __m128i L10 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L11 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L12 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L13 = _mm_unpacklo_epi8(SS10, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = 
_mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); p00 = _mm_add_epi16(L6, coeff16); p10 = _mm_mullo_epi16(L7, coeff9); p20 = _mm_mullo_epi16(L8, coeff15); p30 = _mm_mullo_epi16(L9, coeff7); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[4][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[5][i], p00); p00 = _mm_mullo_epi16(L9, coeff3); p10 = _mm_mullo_epi16(L10, coeff11); p20 = _mm_mullo_epi16(L11, coeff13); p30 = _mm_mullo_epi16(L12, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[6][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_add_epi16(L12, L12); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[7][i], p00); } bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t)); memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = 
_mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i H1 = _mm_unpacklo_epi8(SS1, zero); __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10)); __m128i H2 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H3 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H4 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H5 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H6 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H7 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H8 = _mm_unpacklo_epi8(SS10, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H1, coeff5); p11 = _mm_mullo_epi16(H2, coeff13); p21 = _mm_mullo_epi16(H3, coeff11); p31 = _mm_mullo_epi16(H4, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(H2, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H4, coeff7); p11 = _mm_mullo_epi16(H5, coeff15); p21 = _mm_mullo_epi16(H6, coeff9); p31 = _mm_add_epi16(H7, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else if (bsx == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t 
*dst8 = dst7 + i_dst; for (i = 0; i < 8; src++, i++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + 1 * src[7] + 16) >> 5); dst4[i] = (pel_t)((src[5] + 3 * src[6] + 3 * src[7] + 1 * src[8] + 4) >> 3); dst5[i] = (pel_t)((src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); dst6[i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); dst7[i] = (pel_t)((3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); dst8[i] = (pel_t)((src[11] + 2 * src[12] + src[13] + 2) >> 2); } if (bsy == 32) { //src -> 8,src[7] -> 15 __m128i pad1 = _mm_set1_epi8(src[8]); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); src += 4; dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, 
zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst3)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst4)) = _mm_cvtsi128_si32(p00); if (bsy == 16) { pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; src += 8; __m128i pad1 = _mm_set1_epi8(src[0]); *(int*)(dst5) = _mm_cvtsi128_si32(pad1); *(int*)(dst6) = _mm_cvtsi128_si32(pad1); *(int*)(dst7) = _mm_cvtsi128_si32(pad1); *(int*)(dst8) = _mm_cvtsi128_si32(pad1); dst5[0] = (pel_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); dst5[1] = (pel_t)((src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; *(int*)(dst1) = _mm_cvtsi128_si32(pad1); *(int*)(dst2) = _mm_cvtsi128_si32(pad1); *(int*)(dst3) = _mm_cvtsi128_si32(pad1); *(int*)(dst4) = _mm_cvtsi128_si32(pad1); *(int*)(dst5) = _mm_cvtsi128_si32(pad1); *(int*)(dst6) = _mm_cvtsi128_si32(pad1); *(int*)(dst7) = _mm_cvtsi128_si32(pad1); *(int*)(dst8) = _mm_cvtsi128_si32(pad1); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; __m128i zero = _mm_setzero_si128(); __m128i offset = _mm_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 2; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, offset); sum3 = _mm_add_epi16(sum3, offset); sum1 = _mm_srli_epi16(sum1, 2); sum3 = 
_mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, offset); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } if (bsx > 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2; if (bsy == 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } else { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst2 = dst1 + i_dst; dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; M = _mm_loadu_si128((__m128i*)&first_line[16]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); 
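/* each successive output row reuses the first_line buffer advanced by one byte relative to the row above */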
_mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } } else { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = tab_idx_mode_7[j + 1]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j + 1]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0= _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); 
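/* gather four byte-offset copies of the reference row and weight them with the per-row 4-tap coefficients held in c0 */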
S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } else { if (bsx & 0x07) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_7[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); D0 = _mm_hadds_epi16(t0, zero); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else if (bsx & 0x0f) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_7[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = 
_mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_7[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; } break; } else { __m128i D0, D1; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); for (i = 0; i < real_width; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_store_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); for (i = real_width; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = dst[real_width - 1]; } } } dst += i_dst; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + (bsy >> 1) - 1; int i; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m128i zero = _mm_setzero_si128(); __m128i coeff = _mm_set1_epi16(3); __m128i offset1 = _mm_set1_epi16(4); __m128i offset2 = _mm_set1_epi16(2); int i_dst2 = i_dst * 2; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i p01, p02, p11, p12; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); 
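/* zero-extend the upper eight bytes of each source vector to 16-bit lanes for the second half of the filter */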
__m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_add_epi16(L1, L2); p01 = _mm_mullo_epi16(p01, coeff); p02 = _mm_add_epi16(L0, L3); p02 = _mm_add_epi16(p02, offset1); p01 = _mm_add_epi16(p01, p02); p01 = _mm_srli_epi16(p01, 3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff); p12 = _mm_add_epi16(H0, H3); p12 = _mm_add_epi16(p12, offset1); p11 = _mm_add_epi16(p11, p12); p11 = _mm_srli_epi16(p11, 3); p01 = _mm_packus_epi16(p01, p11); _mm_store_si128((__m128i*)&pfirst[0][i], p01); p01 = _mm_add_epi16(L1, L2); p02 = _mm_add_epi16(L2, L3); p11 = _mm_add_epi16(H1, H2); p12 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p02); p11 = _mm_add_epi16(p11, p12); p01 = _mm_add_epi16(p01, offset2); p11 = _mm_add_epi16(p11, offset2); p01 = _mm_srli_epi16(p01, 2); p11 = _mm_srli_epi16(p11, 2); p01 = _mm_packus_epi16(p01, p11); _mm_store_si128((__m128i*)&pfirst[1][i], p01); } if (i < line_size) { __m128i p01, p02; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p01 = _mm_add_epi16(L1, L2); p01 = _mm_mullo_epi16(p01, coeff); p02 = _mm_add_epi16(L0, L3); p02 = _mm_add_epi16(p02, offset1); p01 = _mm_add_epi16(p01, p02); p01 = _mm_srli_epi16(p01, 3); p01 = _mm_packus_epi16(p01, p01); _mm_storel_epi64((__m128i*)&pfirst[0][i], p01); p01 = _mm_add_epi16(L1, L2); p02 = _mm_add_epi16(L2, L3); p01 = _mm_add_epi16(p01, p02); p01 = _mm_add_epi16(p01, offset2); p01 = _mm_srli_epi16(p01, 2); p01 = _mm_packus_epi16(p01, p01); _mm_storel_epi64((__m128i*)&pfirst[1][i], p01); } bsy >>= 1; if (bsx != 8) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else if (bsy == 4) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else { for (i = 0; i < 16; i = i + 8) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][i]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = 
_mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = tab_idx_mode_9[j + 1]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j + 1]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0 = _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //_mm_maskmoveu_si128(D0, mask, (char*)(dst + i_dst)); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = 
_mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } else { if (bsx & 0x07) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); D0 = _mm_hadds_epi16(t0, zero); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else if (bsx & 0x0f) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else { 
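/* descriptive note: bsx is a multiple of 16 here, so each row is filtered 16 pixels per iteration; once the reference samples run out (real_width <= 0) every remaining row is filled with a single interpolated value, and a short row is padded with its last valid predicted pixel */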
for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = XAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; } break; } else { __m128i D0, D1; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); for (i = 0; i < real_width; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_store_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); for (i = real_width; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = dst[real_width - 1]; } } } dst += i_dst; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); UNUSED_PARAMETER(dir_mode); if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = 
_mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[3][i], p00); } if (i < line_size) { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = 
_mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); } bsy >>= 2; if (bsx != 8) { int i_dstx4 = i_dst << 2; switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst1, pfirst[0] + i); dst1 += i_dstx4; CP32(dst2, pfirst[1] + i); dst2 += i_dstx4; CP32(dst3, pfirst[2] + i); dst3 += i_dstx4; CP32(dst4, pfirst[3] + i); dst4 += i_dstx4; } break; case 16: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 16 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 16 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 16 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 16 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 32: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 32 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 32 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 32 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 32 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 64: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 64 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 64 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 64 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 64 * sizeof(pel_t)); dst4 += i_dstx4; } break; default: assert(0); break; } } else { if (bsy == 2) { for (i = 0; i < bsy; i++) { CP64(dst1, pfirst[0] + i); CP64(dst2, pfirst[1] + i); CP64(dst3, pfirst[2] + i); CP64(dst4, pfirst[3] + i); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); __m128i M3 = _mm_loadu_si128((__m128i*)&pfirst[2][0]); __m128i M4 = _mm_loadu_si128((__m128i*)&pfirst[3][0]); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + 
i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); } } } else { if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); 
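/* descriptive note: pack the two 16-bit halves back to 8-bit pixels and store the fourth prediction row (dst4) */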
p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst1))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst2))[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst3))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst4))[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j, idx; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = (j + 2) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[(j + 1) & 0x07]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0 = _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = 
_mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN16(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; pel_t *pfirst = first_line; __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[0]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[-2]); __m128i L3 = _mm_set1_epi16(src[-3]); src -= 4; for (i = 0; i < line_size - 24; i += 32, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, 
coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L2 = _mm_set1_epi16(src[-2]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L3 = _mm_set1_epi16(src[-3]); } if (bsx == 16) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); } else { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); } for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[0]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[-2]); __m128i L3 = _mm_set1_epi16(src[-3]); src -= 4; bsy >>= 2; for (i = 0; i < bsy; i++, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = 
_mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L2 = _mm_set1_epi16(src[-2]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L3 = _mm_set1_epi16(src[-3]); } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); if (bsy == 4) { src -= 15; __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = 
_mm_unpacklo_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); } else { src -= 15; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M1 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M3 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); M7 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = 
_mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); ALIGN16(pel_t first_line[64 + 256]); int line_size = bsx + (bsy - 1) * 4; int iHeight4 = bsy << 2; src -= 15; for (i = 0; i < line_size - 32; i += 64, src -= 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = 
_mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M4); _mm_store_si128((__m128i*)&first_line[16 + i], M8); _mm_store_si128((__m128i*)&first_line[32 + i], M3); _mm_store_si128((__m128i*)&first_line[48 + i], M7); } if (i < line_size) { __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M4); _mm_store_si128((__m128i*)&first_line[16 + i], M8); } switch (bsx) { case 4: for (i = 0; i < iHeight4; i += 4) { CP32(dst, first_line + i); dst += i_dst; } break; case 8: for (i = 0; i < iHeight4; i += 4) { CP64(dst, first_line + i); dst += i_dst; } break; default: for (i = 0; i < iHeight4; i += 4) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } break; } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); src -= 15; if (bsy == 4) { __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = 
_mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); } else { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, 
coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + (bsy - 1) * 2; int i; int iHeight2 = bsy << 1; __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= 15; for (i = 0; i < line_size - 16; i += 32, src -= 16) { __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_adds_epi16(L1, L2); p01 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(L0, L3); p11 = _mm_add_epi16(L2, L3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 
= _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i + 16], p00); p00 = _mm_adds_epi16(H1, H2); p01 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(H0, H3); p11 = _mm_add_epi16(H2, H3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } if (i < line_size) { __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_adds_epi16(H1, H2); p01 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(H0, H3); p11 = _mm_add_epi16(H2, H3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } if (bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; __m128i coeff2 = _mm_set1_epi16(2); __m128i shuffle = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= 17; for (i = 0; i < line_size - 8; i += 16, src -= 16) { __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p00 = _mm_add_epi16(p00, coeff2); p01 = 
_mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } if (i < line_size) { __m128i p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p01 = _mm_packus_epi16(p01, p01); p01 = _mm_shuffle_epi8(p01, shuffle); _mm_store_si128((__m128i*)&first_line[i], p01); } if (bsx > 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2; if (bsy == 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } else { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst2 = dst1 + i_dst; dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; M = _mm_loadu_si128((__m128i*)&first_line[16]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = 
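/*
 * bsx == 16 copy path of intra_pred_ang_y_30_sse128: successive rows differ only
 * by a one-byte offset into first_line[], so one 16-byte load M is reused and
 * shifted right with _mm_srli_si128(M, 1) between the 8-byte stores instead of
 * reloading for every row.
 */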
_mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } } else if (bsx == 8) { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < bsy; i += 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t dst_tran[64 * 80]); ALIGN16(pel_t src_tran[64 * 8]); int i_dst2 = (((bsy + 15) >> 4) << 4) + 16; int i; UNUSED_PARAMETER(dir_mode); //transposition for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) { src_tran[i] = src[-i]; } intra_pred_ang_x_5_sse128(src_tran, dst_tran, i_dst2, 5, bsy, bsx); if ((bsy > 4) && (bsx > 4)) { pel_t *pDst_128[64]; pel_t *pTra_128[64]; int iSize_x = bsx >> 3; int iSize_y = bsy >> 3; int iSize = iSize_x * iSize_y; for (int y = 0; y < iSize_y; y++) { for (int x = 0; x < iSize_x; x++) { pDst_128[x + y * iSize_x] = dst + x * 8 + y * 8 * i_dst; pTra_128[x + y * iSize_x] = dst_tran + y * 8 + x * 8 * i_dst2; } } for (i = 0; i < iSize; i++) { pel_t *dst_tran_org = pTra_128[i]; pel_t *dst1 = pDst_128[i]; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3, Org_8_4, Org_8_5, Org_8_6, Org_8_7; __m128i p00, p10, p20, p30; __m128i t00, t10, t20, t30; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_4 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_5 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_6 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_7 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); p20 = _mm_unpacklo_epi8(Org_8_4, Org_8_5); p30 = 
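/*
 * intra_pred_ang_y_31_sse128 reuses intra_pred_ang_x_5_sse128: the reference is
 * copied in reverse into src_tran (src_tran[i] = src[-i]), predicted into
 * dst_tran with width and height swapped, and the resulting 8x8 tiles are then
 * transposed back into dst through the _mm_unpacklo_epi8/epi16/epi32 interleave
 * ladder (the 16x4, 4x16 and 4x4 branches use narrower variants of the same idea).
 */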
_mm_unpacklo_epi8(Org_8_6, Org_8_7); t00 = _mm_unpacklo_epi16(p00, p10); t20 = _mm_unpacklo_epi16(p20, p30); t10 = _mm_unpackhi_epi16(p00, p10); t30 = _mm_unpackhi_epi16(p20, p30); p00 = _mm_unpacklo_epi32(t00, t20); p10 = _mm_unpackhi_epi32(t00, t20); p20 = _mm_unpacklo_epi32(t10, t30); p30 = _mm_unpackhi_epi32(t10, t30); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_srli_si128(p00, 8); _mm_storel_epi64((__m128i*)dst2, p00); _mm_storel_epi64((__m128i*)dst3, p10); p10 = _mm_srli_si128(p10, 8); _mm_storel_epi64((__m128i*)dst4, p10); _mm_storel_epi64((__m128i*)dst5, p20); p20 = _mm_srli_si128(p20, 8); _mm_storel_epi64((__m128i*)dst6, p20); _mm_storel_epi64((__m128i*)dst7, p30); p30 = _mm_srli_si128(p30, 8); _mm_storel_epi64((__m128i*)dst8, p30); } } else if (bsx == 16) { for (i = 0; i < 2; i++) { pel_t *dst_tran_org = dst_tran + i * 8 * i_dst2; pel_t *dst1 = dst + i * 8; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3, Org_8_4, Org_8_5, Org_8_6, Org_8_7; __m128i p00, p10, p20, p30; __m128i t00, t20; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_4 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_5 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_6 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_7 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); p20 = _mm_unpacklo_epi8(Org_8_4, Org_8_5); p30 = _mm_unpacklo_epi8(Org_8_6, Org_8_7); t00 = _mm_unpacklo_epi16(p00, p10); t20 = _mm_unpacklo_epi16(p20, p30); p00 = _mm_unpacklo_epi32(t00, t20); p10 = _mm_unpackhi_epi32(t00, t20); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_srli_si128(p00, 8); _mm_storel_epi64((__m128i*)dst2, p00); _mm_storel_epi64((__m128i*)dst3, p10); p10 = _mm_srli_si128(p10, 8); _mm_storel_epi64((__m128i*)dst4, p10); } } else if (bsy == 16) {//bsx == 4 pel_t *dst_tran_org = dst_tran; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3; __m128i p00, p10; __m128i t00, t10; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); t10 = _mm_unpackhi_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); *((int*)(dst5)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst6)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst7)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst8)) = 
_mm_cvtsi128_si32(t10); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; p00 = _mm_unpackhi_epi8(Org_8_0, Org_8_1); p10 = _mm_unpackhi_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); t10 = _mm_unpackhi_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); *((int*)(dst5)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst6)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst7)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst8)) = _mm_cvtsi128_si32(t10); } else {// bsx == 4 bsy ==4 pel_t *dst_tran_org = dst_tran; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3; __m128i p00, p10; __m128i t00; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 64)]); int line_size = (bsy >> 1) + bsx - 1; int i; int aligned_line_size = ((line_size + 63) >> 4) << 4; pel_t *pfirst[2]; __m128i coeff2 = _mm_set1_epi16(2); __m128i zero = _mm_setzero_si128(); __m128i shuffle1 = _mm_setr_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); __m128i shuffle2 = _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1); int i_dst2 = i_dst * 2; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 18; for (i = 0; i < line_size - 4; i += 8, src -= 16) { __m128i p00, p01, p10, p11; __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p10 = _mm_add_epi16(p10, coeff2); p10 = _mm_add_epi16(p10, p11); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p10); } if (i < line_size) { __m128i p10, p11; __m128i 
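/*
 * intra_pred_ang_y_32_sse128 builds two half-length line buffers pfirst[0] and
 * pfirst[1]: each iteration filters 16 reference samples with
 * (a + 2b + c + 2) >> 2, packs them, and the two byte shuffles de-interleave the
 * odd/even phases in reversed order into the two buffers; every pair of output
 * rows then copies a forward-sliding window from pfirst[0] and pfirst[1].
 */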
S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p10 = _mm_add_epi16(p10, coeff2); p10 = _mm_add_epi16(p10, p11); p10 = _mm_srli_epi16(p10, 2); p11 = _mm_packus_epi16(p10, p10); p10 = _mm_shuffle_epi8(p11, shuffle2); p11 = _mm_shuffle_epi8(p11, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p11); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p10); } bsy >>= 1; if (bsx >= 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else { if (bsy == 4) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else { for (i = 0; i < 16; i = i + 8) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][i]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (bsy > 8) { ALIGN16(pel_t 
first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size - 8; i += 16, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 
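/*
 * intra_pred_ang_xy_13_sse128, bsy > 8 path: eight sub-lines pfirst[0..7] are
 * derived from the four neighbour rows src-1 .. src+2.  The left part applies the
 * scalar 3-tap filter (a + 2b + c + 2) >> 2 on a stride of 8, the top part applies
 * one 4-tap weight set per sub-line (e.g. (7,15,9,1)+16 >> 5 for pfirst[0],
 * (3,7,5,1)+8 >> 4 for pfirst[1], ... , (1,2,1)+2 >> 2 for pfirst[7]), and each
 * output row copies from pfirst[row & 7] with an offset that steps every 8 rows.
 */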
5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[3][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[4][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[5][i], p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = _mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[6][i], p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[7][i], p00); } if (i < line_size) { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, 
p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[4][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[5][i], p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[6][i], p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[7][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; if (bsx == 32) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = 
_mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = 
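/*
 * bsy == 8 and bsy == 4 paths of intra_pred_ang_xy_13_sse128: the same weight
 * sets are applied fully unrolled, storing one row per destination pointer
 * dst1..dst8 (or dst1..dst4) directly instead of staging through pfirst[].
 */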
_mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = _mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst8, p00); src += 16; dst1 += 16; dst2 += 16; dst3 += 16; dst4 += 16; dst5 += 16; dst6 += 16; dst7 += 16; dst8 += 16; S0 = _mm_loadu_si128((__m128i*)(src + 2)); S1 = _mm_loadu_si128((__m128i*)(src + 1)); S2 = _mm_loadu_si128((__m128i*)(src)); S3 = _mm_loadu_si128((__m128i*)(src - 1)); L0 = _mm_unpacklo_epi8(S0, zero); L1 = _mm_unpacklo_epi8(S1, zero); L2 = _mm_unpacklo_epi8(S2, zero); L3 = _mm_unpacklo_epi8(S3, zero); H0 = _mm_unpackhi_epi8(S0, zero); H1 = _mm_unpackhi_epi8(S1, zero); H2 = _mm_unpackhi_epi8(S2, zero); H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = 
_mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = _mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst8, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = 
_mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst8, p00); } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, 
coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst1))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst2))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst3))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = 
_mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst4))[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; __m128i shuffle1 = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m128i shuffle2 = _mm_setr_epi8(1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12); __m128i shuffle3 = _mm_setr_epi8(2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13); __m128i shuffle4 = _mm_setr_epi8(3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14); pel_t *pSrc1 = src; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; for (i = 0; i < left_size - 1; i += 4, src += 16) { __m128i p00, p01, p10, p11; __m128i p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p20 = _mm_shuffle_epi8(p00, shuffle3); p30 = _mm_shuffle_epi8(p00, shuffle4); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p30); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p20); ((int*)&pfirst[2][i])[0] = _mm_cvtsi128_si32(p10); ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00); } if (i < left_size) { //ʹcԿܻ __m128i p00, p01, p10; __m128i p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); p10 = _mm_shuffle_epi8(p00, shuffle2); p20 = _mm_shuffle_epi8(p00, shuffle3); p30 = _mm_shuffle_epi8(p00, shuffle4); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p30); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p20); ((int*)&pfirst[2][i])[0] = _mm_cvtsi128_si32(p10); ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00); } src = pSrc1; for (i = left_size; i < line_size; i++, src++) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = 
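/*
 * intra_pred_ang_xy_14_sse128, bsy != 4 path: four sub-lines pfirst[0..3].  The
 * left part filters with (a + 2b + c + 2) >> 2 and de-interleaves the packed
 * bytes into the four buffers through shuffle1..shuffle4; this top-part loop
 * fills them with the 4-tap filters (1,5,7,3)+8 >> 4, (1,3,3,1)+4 >> 3,
 * (3,7,5,1)+8 >> 4 and (1,2,1)+2 >> 2 for pfirst[0]..pfirst[3] respectively.
 */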
_mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[3][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else { if (bsx == 16) { pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i 
H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst2, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst, p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst4, p00); } else { pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst3)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)dst2)[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst)[0] = 
_mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)dst4)[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + bsy / 2 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle1 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); __m128i shuffle2 = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14); int i; pel_t *pSrc1; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; pSrc1 = src; for (i = 0; i < left_size - 4; i += 8, src += 16) { __m128i p00, p01, p10, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p10); } if (i < left_size) { __m128i p00, p01; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); p01 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p00); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p01); } src = pSrc1 + left_size + left_size; for (i = left_size; i < line_size; i += 16, src += 16) { __m128i p00, p01, p10, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_mullo_epi16(p10, 
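/*
 * intra_pred_ang_xy_16_sse128: two sub-lines.  The left part filters with
 * (a + 2b + c + 2) >> 2 and splits even/odd bytes into pfirst[1]/pfirst[0] with
 * shuffle1/shuffle2; this top-part loop stores (a + 3b + 3c + d + 4) >> 3 into
 * pfirst[0] and (a + 2b + c + 2) >> 2 into pfirst[1], and the output rows are
 * emitted in pairs from sliding windows of the two buffers.
 */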
coeff3); p01 = _mm_add_epi16(L0, L3); p11 = _mm_add_epi16(H0, H3); p00 = _mm_add_epi16(p00, coeff4); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 3); p10 = _mm_srli_epi16(p10, 3); p00 = _mm_packus_epi16(p00, p10); _mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst[0] - i); CP32(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst[0] - i); CP64(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += (i_dst << 1); } break; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; __m128i coeff2 = _mm_set1_epi16(2); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= bsy - 1; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst--); dst += i_dst; } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst--); dst += i_dst; } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst--, bsx * sizeof(pel_t)); dst += i_dst; } break; break; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int 
dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int left_size = (bsy - 1) * 2 + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle = _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); pel_t *pSrc1 = src; UNUSED_PARAMETER(dir_mode); src -= bsy; for (i = 0; i < left_size - 16; i += 32, src += 16) { __m128i p00, p01, p10, p11; __m128i p20, p21, p30, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_mullo_epi16(p10, coeff3); p01 = _mm_add_epi16(L0, L3); p11 = _mm_add_epi16(H0, H3); p00 = _mm_add_epi16(p00, coeff4); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 3); p10 = _mm_srli_epi16(p10, 3); p20 = _mm_add_epi16(L1, L2); p30 = _mm_add_epi16(H1, H2); p21 = _mm_add_epi16(L2, L3); p31 = _mm_add_epi16(H2, H3); p20 = _mm_add_epi16(p20, coeff2); p30 = _mm_add_epi16(p30, coeff2); p20 = _mm_add_epi16(p20, p21); p30 = _mm_add_epi16(p30, p31); p20 = _mm_srli_epi16(p20, 2); p30 = _mm_srli_epi16(p30, 2); p00 = _mm_packus_epi16(p00, p20); p10 = _mm_packus_epi16(p10, p30); p00 = _mm_shuffle_epi8(p00, shuffle); p10 = _mm_shuffle_epi8(p10, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); _mm_store_si128((__m128i*)&first_line[i + 16], p10); } if (i < left_size) { __m128i p00, p01; __m128i p20, p21; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p01 = _mm_add_epi16(L0, L3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 3); p20 = _mm_add_epi16(L1, L2); p21 = _mm_add_epi16(L2, L3); p20 = _mm_add_epi16(p20, coeff2); p20 = _mm_add_epi16(p20, p21); p20 = _mm_srli_epi16(p20, 2); p00 = _mm_packus_epi16(p00, p20); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } src = pSrc1; for (i = left_size; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = 
_mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_storeu_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); src -= bsy; if (bsx != 4) { ALIGN16(pel_t first_line[64 + 256]); int left_size = (bsy - 1) * 4 + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; pel_t *pSrc1 = src; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); for (i = 0; i < line_size - 32; i += 64, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); 
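/*
 * For reference, the four interleaved output lines computed in this loop are,
 * in scalar form (identical to the bsx == 4 fallback later in this function):
 *     (s[-1]*3 + s[0]*7 + s[1]*5 + s[2]     + 8) >> 4
 *     (s[-1]   + (s[0] + s[1])*3 + s[2]     + 4) >> 3
 *     (s[-1]   + s[0]*5 + s[1]*7 + s[2]*3   + 8) >> 4
 *     (s[0]    + s[1]*2 + s[2]              + 2) >> 2
 * Each iteration evaluates all four filters for 16 reference samples on the
 * zero-extended 16-bit lanes L0..L3 / H0..H3 and interleaves the results.
 */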
p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M3); _mm_store_si128((__m128i*)&first_line[16 + i], M7); _mm_store_si128((__m128i*)&first_line[32 + i], M4); _mm_store_si128((__m128i*)&first_line[48 + i], M8); } if (i < left_size) { __m128i p00, p10, p20, p30; __m128i M1, M3, M5, M7; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); _mm_store_si128((__m128i*)&first_line[i], M3); _mm_store_si128((__m128i*)&first_line[16 + i], M7); } src = pSrc1 + bsy; for (i = left_size; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = 
_mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_storeu_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } switch (bsx) { case 8: while (bsy--) { CP64(dst, pfirst); dst += i_dst; pfirst -= 4; } break; case 16: case 32: case 64: while (bsy--) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } break; default: assert(0); break; } } else { dst += (bsy - 1) * i_dst; for (i = 0; i < bsy; i++, src++) { dst[0] = (src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4; dst[1] = (src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3; dst[2] = (src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4; dst[3] = (src[0] + src[1] * 2 + src[2] + 2) >> 2; dst -= i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN16(pel_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 7; pel_t *pfirst1 = first_line; pel_t *src_org = src; src -= bsy; __m128i zero = _mm_setzero_si128(); __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[-1]); __m128i L1 = _mm_set1_epi16(src[0]); __m128i L2 = _mm_set1_epi16(src[1]); __m128i L3 = _mm_set1_epi16(src[2]); src += 4; for (i = 0; i < left_size + 1; i += 32, src += 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L0 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L1 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = 
_mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L2 = _mm_set1_epi16(src[1]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L3 = _mm_set1_epi16(src[2]); } src = src_org + 1; for (; i < line_size; i += 16, src += 16) { coeff2 = _mm_set1_epi16(2); __m128i p00, p10; __m128i p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_mullo_epi16(L0, coeff2); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_mullo_epi16(H0, coeff2); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&first_line[i], p00); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 8; } } else if (bsx == 8) { __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[-2]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[0]); __m128i L3 = _mm_set1_epi16(src[1]); src -= 4; bsy >>= 2; for (i = 0; i < bsy; i++, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L3 = _mm_set1_epi16(src[1]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L2 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = 
_mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L0 = _mm_set1_epi16(src[-2]); } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); if (bsy == 4) { src -= 15; __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); } else { src -= 15; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 
= _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M1 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M3 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); M7 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = 
_mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); } } } xavs2-1.3/source/common/vec/intrinsic_intra-pred_avx2.c000066400000000000000000011547671340660520300232440ustar00rootroot00000000000000/* * intrinsic_intra-pred_avx2.c * * Description of this file: * AVX2 assembly functions of Intra-Prediction module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include #include #include #include #include #include "../basic_types.h" #include "avs2_defs.h" #include "intrinsic.h" #ifndef _MSC_VER #define __int64 int64_t #endif void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { // block_sizeС8ʱavx2sse intra_pred_ver_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } pel_t *rsrc = src + 1; int i; __m256i S1; if (bsx >= 32) { for (i = 0; i < bsy; i++) { S1 = _mm256_loadu_si256((const __m256i*)(rsrc));//32 _mm256_storeu_si256((__m256i*)(dst), S1); if (32 < bsx) { S1 = _mm256_loadu_si256((const __m256i*)(rsrc + 32));//64 _mm256_storeu_si256((__m256i*)(dst + 32), S1); } dst += i_dst; } } else { int i, j; __m128i S1; if (bsx & 15) {//4/8 __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_storeu_si128((__m128i*)(dst + j), S1); } S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_maskmoveu_si128(S1, mask, (char *)&dst[j]); dst += i_dst; } } /*{//4/8 for (i = 0; i < bsy; i++) { for (j = 0; j < bsx; j += 4) { S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_storeu_si128((__m128i*)(dst + j), S1); } dst += i_dst; } }*/ else { for (i = 0; i < bsy; i++) {//16 S1 = _mm_loadu_si128((const __m128i*)rsrc); _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_hor_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { // block_sizeС8ʱavx2sse intra_pred_hor_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int i; pel_t *rsrc = src - 1; __m256i S1; if (bsx >= 32) { for (i = 0; i < bsy; i++) { S1 = _mm256_set1_epi8((char)rsrc[-i]);//32 _mm256_storeu_si256((__m256i*)(dst), S1); if (32 < bsx) {//64 _mm256_storeu_si256((__m256i*)(dst + 32), S1); } 
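/*
 * Horizontal prediction fills each row with the left reference sample of that
 * row: _mm256_set1_epi8 broadcasts rsrc[-i] to all 32 byte lanes, and the
 * second store covers columns 32..63 when bsx == 64.
 */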
dst += i_dst; } } else { int i, j; __m128i S1; if (bsx & 15) {//4/8 __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_storeu_si128((__m128i*)(dst + j), S1); } S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_maskmoveu_si128(S1, mask, (char*)&dst[j]); dst += i_dst; } } else { for (i = 0; i < bsy; i++) {//16 S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_dc_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { // block_sizeС8ʱavx2sse intra_pred_dc_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int bAboveAvail = dir_mode >> 8; int bLeftAvail = dir_mode & 0xFF; int x, y; int iDCValue = 0; pel_t *rsrc = src - 1; __m256i S1; int i; if (bLeftAvail) { for (y = 0; y < bsy; y++) { iDCValue += rsrc[-y]; } rsrc = src + 1; if (bAboveAvail) { for (x = 0; x < bsx; x++) { iDCValue += rsrc[x]; } iDCValue += ((bsx + bsy) >> 1); iDCValue = (iDCValue * (512 / (bsx + bsy))) >> 9; } else { iDCValue += bsy / 2; iDCValue /= bsy; } } else { rsrc = src + 1; if (bAboveAvail) { for (x = 0; x < bsx; x++) { iDCValue += rsrc[x]; } iDCValue += bsx / 2; iDCValue /= bsx; } else { iDCValue = g_dc_value; } } /* for (y = 0; y < bsy; y++) { for (x = 0; x < bsx; x++) { dst[x] = iDCValue; } dst += i_dst; } */ S1 = _mm256_set1_epi8((char)iDCValue); if (bsx >= 32) { for (i = 0; i < bsy; i++) { _mm256_storeu_si256((__m256i*)(dst), S1);//32 if (32 < bsx) {//64 _mm256_storeu_si256((__m256i*)(dst + 32), S1); } dst += i_dst; } } else { __m128i S1; int j; S1 = _mm_set1_epi8((char)iDCValue); if (bsx & 15) {//4/8 __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { _mm_storeu_si128((__m128i*)(dst + j), S1); } _mm_maskmoveu_si128(S1, mask, (char*)&dst[j]); dst += i_dst; } } else { for (i = 0; i < bsy; i++) {//16 _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_plane_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *rpSrc; int iH = 0; int iV = 0; int iA, iB, iC; int x, y; int iW2 = bsx >> 1; int iH2 = bsy >> 1; int ib_mult[5] = { 13, 17, 5, 11, 23 }; int ib_shift[5] = { 7, 10, 11, 15, 19 }; int im_h = ib_mult [tab_log2[bsx] - 2]; int is_h = ib_shift[tab_log2[bsx] - 2]; int im_v = ib_mult [tab_log2[bsy] - 2]; int is_v = ib_shift[tab_log2[bsy] - 2]; int iTmp; UNUSED_PARAMETER(dir_mode); rpSrc = src + iW2; for (x = 1; x < iW2 + 1; x++) { iH += x * (rpSrc[x] - rpSrc[-x]); } rpSrc = src - iH2; for (y = 1; y < iH2 + 1; y++) { iV += y * (rpSrc[-y] - rpSrc[y]); } iA = (src[-1 - (bsy - 1)] + src[1 + bsx - 1]) << 4; iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h; iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v; iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16; __m256i TC, TB, TA, T_Start, T, D, D1; __m256i mask ; TA = _mm256_set1_epi16((int16_t)iTmp); TB = _mm256_set1_epi16((int16_t)iB); TC = _mm256_set1_epi16((int16_t)iC); T_Start = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); T_Start = _mm256_mullo_epi16(TB, T_Start); T_Start = _mm256_add_epi16(T_Start, TA); TB = _mm256_mullo_epi16(TB, _mm256_set1_epi16(16)); if (bsx == 4){ mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[3]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); _mm256_maskstore_epi32((int*)dst, mask, D); 
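/*
 * Plane prediction evaluates the ramp (iTmp + x*iB + y*iC) >> 5 per sample:
 * T_Start holds the 16 per-column terms of the current row, TC adds the
 * per-row step iC below, and _mm256_packus_epi16 clips the result to 8 bits.
 */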
T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else if (bsx == 8) { mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[7]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); _mm256_maskstore_epi64((__int64*)dst, mask, D); T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else if (bsx == 16) { mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[15]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); D = _mm256_permute4x64_epi64(D, 8); _mm256_maskstore_epi64((__int64*)dst, mask, D); T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else { //32 64 for (y = 0; y < bsy; y++) { T = T_Start; for (x = 0; x < bsx; x += 32) { D = _mm256_srai_epi16(T, 5); T = _mm256_add_epi16(T, TB); D1 = _mm256_srai_epi16(T, 5); D = _mm256_packus_epi16(D, D1); D = _mm256_permute4x64_epi64(D, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + x), D); T = _mm256_add_epi16(T, TB); } T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } } void intra_pred_bilinear_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int x, y; int ishift_x = tab_log2[bsx]; int ishift_y = tab_log2[bsy]; int ishift = XAVS2_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); int a, b, c, t, val; pel_t *p; __m256i T, T1, T2, T3, C1, C2, ADD; ALIGN32(itr_t pTop[MAX_CU_SIZE + 32]); ALIGN32(itr_t pLeft[MAX_CU_SIZE + 32]); ALIGN32(itr_t pT[MAX_CU_SIZE + 32]); ALIGN32(itr_t pL[MAX_CU_SIZE + 32]); ALIGN32(itr_t wy[MAX_CU_SIZE + 32]); UNUSED_PARAMETER(dir_mode); p = src + 1; __m256i ZERO = _mm256_setzero_si256(); for (x = 0; x < bsx; x += 32) { T = _mm256_loadu_si256((__m256i*)(p + x));//8bit 32 T1 = _mm256_unpacklo_epi8(T, ZERO); //0 2 T2 = _mm256_unpackhi_epi8(T, ZERO); //1 3 T = _mm256_permute2x128_si256(T1, T2, 0x0020); _mm256_store_si256((__m256i*)(pTop + x), T); T = _mm256_permute2x128_si256(T1, T2, 0x0031); _mm256_store_si256((__m256i*)(pTop + x + 16), T); } for (y = 0; y < bsy; y++) { pLeft[y] = src[-1 - y]; } //p = src + 1; //for (x = 0; x < bsx; x++) { // pTop[x] = p[x]; //} //p = src - 1; //for (y = 0; y < bsy; y++) { // pLeft[y] = p[-y]; //} a = pTop[bsx - 1]; b = pLeft[bsy - 1]; if (bsx == bsy) { c = (a + b + 1) >> 1; } else { c = (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6); } t = (c << 1) - a - b; T = _mm256_set1_epi16((int16_t)b); for (x = 0; x < bsx; x += 16) { T1 = _mm256_loadu_si256((__m256i*)(pTop + x)); T2 = _mm256_sub_epi16(T, T1); T1 = _mm256_slli_epi16(T1, ishift_y); _mm256_store_si256((__m256i*)(pT + x), T2); _mm256_store_si256((__m256i*)(pTop + x), T1); } T = _mm256_set1_epi16((int16_t)a); for (y = 0; y < bsy; y += 16) { T1 = _mm256_loadu_si256((__m256i*)(pLeft + y)); T2 = _mm256_sub_epi16(T, T1); T1 = _mm256_slli_epi16(T1, ishift_x); _mm256_store_si256((__m256i*)(pL + y), T2); _mm256_store_si256((__m256i*)(pLeft + y), T1); } T = _mm256_set1_epi16((int16_t)t); T = _mm256_mullo_epi16(T, _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); T1 = _mm256_set1_epi16((int16_t)(16 * t)); for (y = 0; y < bsy; y += 16) { _mm256_store_si256((__m256i*)(wy + y), T); T = _mm256_add_epi16(T, T1); } C1 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); C2 = _mm256_set1_epi32(8); if (bsx == 4) { __m256i pTT = _mm256_loadu_si256((__m256i*)pT); T = _mm256_loadu_si256((__m256i*)pTop); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (y = 0; y < bsy; 
y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); T = _mm256_add_epi16(T, pTT); T1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(T, 0)); T1 = _mm256_slli_epi32(T1, ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy); T1 = _mm256_packus_epi32(T1, T1); T1 = _mm256_packus_epi16(T1, T1); _mm256_maskstore_epi32((int*)dst, mask, T1); dst += i_dst; } } else if (bsx == 8) { __m256i pTT = _mm256_load_si256((__m256i*)pT); T = _mm256_load_si256((__m256i*)pTop); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); T = _mm256_add_epi16(T, pTT); T1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(T, 0)); T1 = _mm256_slli_epi32(T1, ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy); //mask //T1 is the result T1 = _mm256_packus_epi32(T1, T1); //1 2 3 4 1 2 3 4 5 6 7 8 5 6 7 8 T1 = _mm256_permute4x64_epi64(T1, 0x0008); T1 = _mm256_packus_epi16(T1, T1); _mm256_maskstore_epi64((__int64*)dst, mask, T1); dst += i_dst; } } else { __m256i TT[8]; __m256i PTT[8]; __m256i temp1, temp2; __m256i mask1 = _mm256_set_epi32(3, 2, 1, 0, 5, 1, 4, 0); __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (x = 0; x < bsx; x += 16) { int idx = x >> 3; __m256i M0 = _mm256_loadu_si256((__m256i*)(pTop + x)); //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i M1 = _mm256_loadu_si256((__m256i*)(pT + x)); temp1 = _mm256_unpacklo_epi16(M0, ZERO); //0 1 2 3 8 9 10 11 temp2 = _mm256_unpackhi_epi16(M0, ZERO); //4 5 6 7 12 13 14 15 TT[idx] = _mm256_permute2x128_si256(temp1, temp2, 0x0020); //0 1 2 3 4 5 6 7 TT[idx + 1] = _mm256_permute2x128_si256(temp1, temp2, 0x0031); //8 9 10 11 12 13 14 15 PTT[idx] = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(M1, 0)); PTT[idx + 1] = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(M1, 1)); } for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); T3 = _mm256_mullo_epi32(C2, ADD); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); for (x = 0; x < bsx; x += 16) { int idx = x >> 3; TT[idx] = _mm256_add_epi32(TT[idx], PTT[idx]); //0 1 2 3 4 5 6 7 TT[idx + 1] = _mm256_add_epi32(TT[idx + 1], PTT[idx + 1]); //8 9 10 11 12 13 14 15 T1 = _mm256_slli_epi32(TT[idx], ishift_x); T2 = _mm256_slli_epi32(TT[idx + 1], ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy);//0 1 2 3 4 5 6 7 ADD = _mm256_add_epi32(ADD, T3); T2 = _mm256_add_epi32(T2, ADD); T2 = _mm256_srai_epi32(T2, ishift_xy);//8 9 10 11 12 13 14 15 //T1 T2 is the result T1 = _mm256_packus_epi32(T1, T2); //0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 T1 = _mm256_packus_epi16(T1, T1); //0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15 T1 = _mm256_permutevar8x32_epi32(T1, mask1); //store 128 bits _mm256_maskstore_epi64((__int64*)(dst + x), mask2, T1); ADD = _mm256_add_epi32(ADD, T3); } dst += i_dst; } } } void intra_pred_ang_x_3_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t 
*dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if ((bsy > 4) && (bsx > 8)) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); ALIGN32(pel_t first_line[(64 + 176 + 16) << 2]); int line_size = bsx + (((bsy - 4) * 11) >> 2); int aligned_line_size = 64 + 176 + 16; int i; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i SS2, SS11; __m256i L2, L3, L4, L5, L6, L7, L8, L9, L10, L11, L12, L13; __m256i H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12, H13; for (i = 0; i < line_size - 16; i += 32, src += 32) { SS2 = _mm256_loadu_si256((__m256i*)(src + 2));//2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//2...17 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//18...34 SS2 = _mm256_loadu_si256((__m256i*)(src + 3));//3 4 5 6 7 8 9 10 11 12 13 14 15 L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//3...18 H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//19...35 SS2 = _mm256_loadu_si256((__m256i*)(src + 4));//4 5 6 7 8 9 10 11 12 13 14 15 L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//4 H4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//20 SS2 = _mm256_loadu_si256((__m256i*)(src + 5));//5 6 7 8 9 10 11 12 13 14 15 L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//5 H5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//21 SS2 = _mm256_loadu_si256((__m256i*)(src + 6));//6 7 8 9 10 11 12 13 14 15 L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//6 H6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//22 SS2 = _mm256_loadu_si256((__m256i*)(src + 7));//7 8 9 10 11 12 13 14 15 L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8));//8 9 10 11 12 13 14 15 L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 9));//9 10 11 12 13 14 15 L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10));//10 11 12 13 14 15 L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 11));//11 12 13 14 15 16 17 18 19 20 21 22 23 L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 12));//12 13 14 15 16 17 18 19 20... 
L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 13));//13 ...28 29...44 L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); p00 = _mm256_add_epi16(L2, coeff8);//2 ...17 p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_add_epi16(H2, coeff8); p11 = _mm256_mullo_epi16(H3, coeff5); p21 = _mm256_mullo_epi16(H4, coeff7); p31 = _mm256_mullo_epi16(H5, coeff3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H5, H8); p11 = _mm256_add_epi16(H6, H7); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H8, coeff3); p11 = _mm256_mullo_epi16(H9, coeff7); p21 = _mm256_mullo_epi16(H10, coeff5); p31 = _mm256_add_epi16(H11, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H11, H13); p11 = _mm256_mullo_epi16(H12, coeff2); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } if (i < line_size) { SS2 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = 
_mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); p00 = _mm256_add_epi16(L2, coeff8); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[3][i], mask, p00); } bsy >>= 2; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst4 + 32), M); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); dst1 
= dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_maskstore_epi64((__int64*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_maskstore_epi64((__int64*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_maskstore_epi64((__int64*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_maskstore_epi64((__int64*)dst4, mask, M); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; }*/ } else if (bsx == 16) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i p00, p10, p20, p30; __m256i SS2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 4)); __m256i L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 5)); __m256i L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 6)); __m256i L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 7)); __m256i L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8)); __m256i L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 9)); __m256i L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10)); __m256i L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); __m256i SS11 = _mm256_loadu_si256((__m256i*)(src + 11)); __m256i L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 12)); __m256i L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 13)); __m256i L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); p00 = _mm256_add_epi16(L2, coeff8); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)dst1, mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); 
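/*
 * _mm256_packus_epi16(p00, p00) packs within each 128-bit lane, so the 16
 * result bytes land in 64-bit lanes 0 and 2; _mm256_permute4x64_epi64 with
 * 0x0008 gathers them into the low 128 bits ahead of the masked 16-byte store.
 */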
_mm256_maskstore_epi64((__int64*)dst2, mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst3, mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst4, mask, p00); } else { //8x8 8x32 4x16 4x4 intra_pred_ang_x_3_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_4_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx != bsy && bsx < bsy){ intra_pred_ang_x_4_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } ALIGN32(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; __m256i zero = _mm256_setzero_si256(); __m256i offset = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 3; for (i = 0; i < line_size - 16; i += 32, src += 32) { //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i sum1 = _mm256_add_epi16(tmp0, tmp1); __m256i sum2 = _mm256_add_epi16(tmp1, tmp2); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i sum3 = _mm256_add_epi16(tmp0, tmp1); __m256i sum4 = _mm256_add_epi16(tmp1, tmp2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, offset); sum3 = _mm256_add_epi16(sum3, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3);//0 2 1 3 sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 
28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i sum1 = _mm256_add_epi16(L0, L1); __m256i sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x0008); //store 128 bit __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)(first_line + i), mask2, sum1); //_mm_storel_epi64((__m128i*)&first_line[i], sum1); } if (bsx == 64){ for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i]+32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 4] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 6] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const 
__m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx == bsy || bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else { __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } }*/ } void intra_pred_ang_x_5_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i coeff9 = _mm256_set1_epi16(9); __m256i coeff11 = _mm256_set1_epi16(11); __m256i coeff13 = _mm256_set1_epi16(13); __m256i coeff15 = _mm256_set1_epi16(15); __m256i coeff16 = _mm256_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (((bsy > 4) && (bsx > 8))) { ALIGN32(pel_t first_line[(64 + 80 + 16) << 3]); int line_size = bsx + ((bsy - 8) >> 3) * 11; int aligned_line_size = (((line_size + 15) >> 4) << 4) + 16; pel_t *pfirst[8]; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i SS1; __m256i L1, L2, L3, L4, L5, L6, L7, L8, L9, L10, L11, L12, L13; __m256i H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12, H13; for (i = 0; i < line_size - 16; i += 32, src += 32) { SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//17 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//18 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//19 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 H4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//20 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H5 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H1, coeff5); p11 = _mm256_mullo_epi16(H2, coeff13); p21 = _mm256_mullo_epi16(H3, coeff11); p31 = _mm256_mullo_epi16(H4, coeff3); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H3, coeff5); p21 = _mm256_mullo_epi16(H4, coeff7); p31 = _mm256_mullo_epi16(H5, coeff3); p01 = _mm256_add_epi16(H2, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H4, coeff7); p11 = _mm256_mullo_epi16(H5, coeff15); p21 = _mm256_mullo_epi16(H6, coeff9); p31 = _mm256_add_epi16(H7, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = 
_mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H5, H8); p11 = _mm256_add_epi16(H6, H7); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); p00 = _mm256_add_epi16(L6, coeff16); p10 = _mm256_mullo_epi16(L7, coeff9); p20 = _mm256_mullo_epi16(L8, coeff15); p30 = _mm256_mullo_epi16(L9, coeff7); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_add_epi16(H6, coeff16); p11 = _mm256_mullo_epi16(H7, coeff9); p21 = _mm256_mullo_epi16(H8, coeff15); p31 = _mm256_mullo_epi16(H9, coeff7); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[4][i], p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H8, coeff3); p11 = _mm256_mullo_epi16(H9, coeff7); p21 = _mm256_mullo_epi16(H10, coeff5); p31 = _mm256_add_epi16(H11, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[5][i], p00); p00 = _mm256_mullo_epi16(L9, coeff3); p10 = _mm256_mullo_epi16(L10, coeff11); p20 = _mm256_mullo_epi16(L11, coeff13); p30 = _mm256_mullo_epi16(L12, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H9, coeff3); p11 = _mm256_mullo_epi16(H10, coeff11); p21 = _mm256_mullo_epi16(H11, coeff13); p31 = _mm256_mullo_epi16(H12, coeff5); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[6][i], p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_add_epi16(L12, L12); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H11, H13); p11 = _mm256_add_epi16(H12, H12); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[7][i], p00); } if (i < line_size) { SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[3][i], mask, p00); p00 = _mm256_add_epi16(L6, coeff16); p10 = _mm256_mullo_epi16(L7, coeff9); p20 = _mm256_mullo_epi16(L8, coeff15); p30 = _mm256_mullo_epi16(L9, coeff7); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[4][i], mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, 
p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[5][i], mask, p00); p00 = _mm256_mullo_epi16(L9, coeff3); p10 = _mm256_mullo_epi16(L10, coeff11); p20 = _mm256_mullo_epi16(L11, coeff13); p30 = _mm256_mullo_epi16(L12, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[6][i], mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_add_epi16(L12, L12); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64*)&pfirst[7][i], mask, p00); } bsy >>= 3; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst4 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_storeu_si256((__m256i*)dst5, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst5 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_storeu_si256((__m256i*)dst6, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst6 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_storeu_si256((__m256i*)dst7, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst7 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_storeu_si256((__m256i*)dst8, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst8 + 32), M); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_storeu_si256((__m256i*)dst5, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_storeu_si256((__m256i*)dst6, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_storeu_si256((__m256i*)dst7, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_storeu_si256((__m256i*)dst8, M); dst1 = dst8 + i_dst; dst2 = dst1 + 
i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_maskstore_epi64((__int64*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_maskstore_epi64((__int64*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_maskstore_epi64((__int64*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_maskstore_epi64((__int64*)dst4, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_maskstore_epi64((__int64*)dst5, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_maskstore_epi64((__int64*)dst6, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_maskstore_epi64((__int64*)dst7, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_maskstore_epi64((__int64*)dst8, mask, M); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t)); memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; }*/ } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m256i p00, p10, p20, p30; __m256i SS1; __m256i L1, L2, L3, L4, L5, L6, L7, L8; SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)dst1, mask, p00); p10 = 
_mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst2, mask, p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst3, mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst4, mask, p00); } else { //8x8 8x32 4x4 4x16 intra_pred_ang_x_5_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_6_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; __m256i zero = _mm256_setzero_si256(); __m256i offset = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 2; for (i = 0; i < line_size - 16; i += 32, src += 32) { //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i sum1 = _mm256_add_epi16(tmp0, tmp1); __m256i sum2 = _mm256_add_epi16(tmp1, tmp2); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i sum3 = _mm256_add_epi16(tmp0, tmp1); __m256i sum4 = _mm256_add_epi16(tmp1, tmp2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, offset); sum3 = _mm256_add_epi16(sum3, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3);//0 2 1 3 sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 
28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i sum1 = _mm256_add_epi16(L0, L1); __m256i sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x0008); //store 128 bit __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)(first_line + i), mask2, sum1); //_mm_storel_epi64((__m128i*)&first_line[i], sum1); } if (bsx == 64){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 1] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 3] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = 
_mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /* if (bsx == bsy || bsx >= 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else {//8x32 4x16 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } }*/ } void intra_pred_ang_x_7_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx <= 8) {//4x4 8x8 intra_pred_ang_x_7_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){//16 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_7[j]; c0 = _mm256_loadu_si256((__m256i*)tab_coeff_mode_7_avx[j]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64*)dst, mask, D0); dst += i_dst; } } else {//32 64 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_7[j]; c0 = _mm256_loadu_si256((__m256i*)tab_coeff_mode_7_avx[j]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 
8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } else { intra_pred_ang_x_7_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_8_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + (bsy >> 1) - 1; int i; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m256i zero = _mm256_setzero_si256(); __m256i coeff = _mm256_set1_epi16(3); //16 __m256i offset1 = _mm256_set1_epi16(4); __m256i offset2 = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; __m256i p01, p02, p11, p12; __m256i p21, p22, p31, p32; __m256i tmp0, tmp1, tmp2, tmp3; for (i = 0; i < line_size - 16; i += 32, src += 32) { //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 
28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i H3 = _mm256_unpackhi_epi8(S3, zero); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); tmp3 = _mm256_permute2x128_si256(L3, H3, 0x0020); p01 = _mm256_add_epi16(tmp1, tmp2); p01 = _mm256_mullo_epi16(p01, coeff); p02 = _mm256_add_epi16(tmp0, tmp3); p02 = _mm256_add_epi16(p02, offset1); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_srli_epi16(p01, 3); // //prepare for next line p21 = _mm256_add_epi16(tmp1, tmp2); p22 = _mm256_add_epi16(tmp2, tmp3); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17....24 25.... tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); tmp3 = _mm256_permute2x128_si256(L3, H3, 0x0031); p11 = _mm256_add_epi16(tmp1, tmp2); p11 = _mm256_mullo_epi16(p11, coeff); p12 = _mm256_add_epi16(tmp0, tmp3); p12 = _mm256_add_epi16(p12, offset1); p11 = _mm256_add_epi16(p11, p12); p11 = _mm256_srli_epi16(p11, 3); //prepare for next line p31 = _mm256_add_epi16(tmp1, tmp2); p32 = _mm256_add_epi16(tmp2, tmp3); p01 = _mm256_packus_epi16(p01, p11); p01 = _mm256_permute4x64_epi64(p01, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p01); p21 = _mm256_add_epi16(p21, p22); p31 = _mm256_add_epi16(p31, p32); p21 = _mm256_add_epi16(p21, offset2); p31 = _mm256_add_epi16(p31, offset2); p21 = _mm256_srli_epi16(p21, 2); p31 = _mm256_srli_epi16(p31, 2); p21 = _mm256_packus_epi16(p21, p31); p21 = _mm256_permute4x64_epi64(p21, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p21); } if (i < line_size) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); p01 = _mm256_add_epi16(L1, L2); p01 = _mm256_mullo_epi16(p01, coeff); p02 = _mm256_add_epi16(L0, L3); p02 = _mm256_add_epi16(p02, offset1); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_srli_epi16(p01, 3); p01 = _mm256_packus_epi16(p01, p01); p01 = _mm256_permute4x64_epi64(p01, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p01); p01 = _mm256_add_epi16(L1, L2); p02 = _mm256_add_epi16(L2, L3); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_add_epi16(p01, offset2); p01 = _mm256_srli_epi16(p01, 2); p01 = _mm256_packus_epi16(p01, p01); p01=_mm256_permute4x64_epi64(p01,0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p01); } 
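/* Added note: at this point pfirst[0] and pfirst[1] hold the two smoothed
 * reference lines for this mode; the branches below differ only in store
 * width (full 256-bit stores for bsx >= 32, masked 64-/32-bit lane stores
 * for the narrower sizes). A minimal scalar sketch of the copy stage that
 * follows, assuming bsy has already been halved as in the code below
 * (illustrative only, in the spirit of the commented-out memcpy fallback
 * kept at the end of this function):
 *
 *     for (i = 0; i < bsy; i++) {
 *         memcpy(dst,         pfirst[0] + i, bsx * sizeof(pel_t));
 *         memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t));
 *         dst += i_dst * 2;
 *     }
 */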
bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx != 8) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else if (bsy == 4) {//8x8 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); __m256i M1 = _mm256_loadu_si256((__m256i*)&pfirst[0][0]); __m256i M2 = _mm256_loadu_si256((__m256i*)&pfirst[1][0]); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); } else { //8x32 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < 16; i = i + 4) { __m256i M1 = 
_mm256_loadu_si256((__m256i*)&pfirst[0][i]); __m256i M2 = _mm256_loadu_si256((__m256i*)&pfirst[1][i]); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64*)dst, mask, M1); _mm256_maskstore_epi64((__int64*)(dst + i_dst), mask, M2); dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; } }*/ } void intra_pred_ang_x_9_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) {//4 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 0x0f) {//8 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){//16 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm256_set1_epi32(((int*)(tab_coeff_mode_9[j]))[0]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 
8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64*)dst, mask, D0); dst += i_dst; } } else {//32 64 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm256_set1_epi32(((int*)tab_coeff_mode_9[j])[0]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... 
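/* Added note: each maddubs/hadds pair in this loop reduces four interleaved
 * neighbours to one filtered sample, so each output pixel of row j is roughly
 *     dst[x] = (c0*src[idx+x] + c1*src[idx+x+1]
 *             + c2*src[idx+x+2] + c3*src[idx+x+3] + 64) >> 7
 * with idx = tab_idx_mode_9[j] and c0..c3 the four per-row taps broadcast
 * from tab_coeff_mode_9[j]; the rounding offset 64 and the shift by 7 come
 * from 'off' and the srli below. A minimal scalar sketch, assuming the table
 * stores the four 8-bit taps in that order (illustrative only):
 *
 *     for (x = 0; x < bsx; x++) {
 *         dst[x] = (pel_t)((c0 * src[idx + x]     + c1 * src[idx + x + 1] +
 *                           c2 * src[idx + x + 2] + c3 * src[idx + x + 3] +
 *                           64) >> 7);
 *     }
 */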
t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } else {//4x16 8x32 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_10_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsy == 4){ intra_pred_ang_x_10_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if (bsy != 4) { __m256i zero = _mm256_setzero_si256(); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); ALIGN32(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size - 16; i += 32, src += 32) { __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);// 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i H3 = _mm256_unpackhi_epi8(S3, zero); __m256i tmpL0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmpL1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmpL2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i tmpL3 = _mm256_permute2x128_si256(L3, H3, 0x0020); __m256i tmpH0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... __m256i tmpH1 = _mm256_permute2x128_si256(L1, H1, 0x0031); __m256i tmpH2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i tmpH3 = _mm256_permute2x128_si256(L3, H3, 0x0031); p00 = _mm256_mullo_epi16(tmpL0, coeff3);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 p10 = _mm256_mullo_epi16(tmpL1, coeff7); p20 = _mm256_mullo_epi16(tmpL2, coeff5); p30 = _mm256_add_epi16(tmpL3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(tmpH0, coeff3);//16 17...24 25... 
p11 = _mm256_mullo_epi16(tmpH1, coeff7); p21 = _mm256_mullo_epi16(tmpH2, coeff5); p31 = _mm256_add_epi16(tmpH3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(tmpL1, tmpL2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(tmpL0, tmpL3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(tmpH1, tmpH2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(tmpH0, tmpH3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p10 = _mm256_mullo_epi16(tmpL1, coeff5); p20 = _mm256_mullo_epi16(tmpL2, coeff7); p30 = _mm256_mullo_epi16(tmpL3, coeff3); p00 = _mm256_add_epi16(tmpL0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(tmpH1, coeff5); p21 = _mm256_mullo_epi16(tmpH2, coeff7); p31 = _mm256_mullo_epi16(tmpH3, coeff3); p01 = _mm256_add_epi16(tmpH0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(tmpL1, tmpL2); p10 = _mm256_add_epi16(tmpL2, tmpL3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(tmpH1, tmpH2); p11 = _mm256_add_epi16(tmpH2, tmpH3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } if (i < line_size) { __m256i p00, p10, p20, p30; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = 
_mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[3][i], mask, p00); } bsy >>= 2; int i_dstx4 = i_dst << 2; if (bsx == 64){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i + 32)); _mm256_storeu_si256((__m256i*)(dst4 + 32), M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 32){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_storeu_si256((__m256i*)dst4, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi64((__int64*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi64((__int64*)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi64((__int64*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi64((__int64*)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i 
= 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi32((int*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi32((int*)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } /* if (bsx != 8) { switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst1, pfirst[0] + i); dst1 += i_dstx4; CP32(dst2, pfirst[1] + i); dst2 += i_dstx4; CP32(dst3, pfirst[2] + i); dst3 += i_dstx4; CP32(dst4, pfirst[3] + i); dst4 += i_dstx4; } break; case 16: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 16 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 16 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 16 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 16 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 32: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 32 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 32 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 32 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 32 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 64: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 64 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 64 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 64 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 64 * sizeof(pel_t)); dst4 += i_dstx4; } break; default: assert(0); break; } } else { if (bsy == 2) { //8x8 for (i = 0; i < bsy; i++) { CP64(dst1, pfirst[0] + i); CP64(dst2, pfirst[1] + i); CP64(dst3, pfirst[2] + i); CP64(dst4, pfirst[3] + i); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else {//8x32 __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); __m128i M3 = _mm_loadu_si128((__m128i*)&pfirst[2][0]); __m128i M4 = _mm_loadu_si128((__m128i*)&pfirst[3][0]); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = 
_mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); } }*/ } } void intra_pred_ang_x_11_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx & 0x07) { intra_pred_ang_x_11_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 0x0f) { intra_pred_ang_x_11_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){ __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx = (j + 1) >> 3; c0 = _mm256_set1_epi32(((int*)(tab_coeff_mode_11[j & 0x07]))[0]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 
8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64*)dst, mask, D0); dst += i_dst; } } else { __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = (j + 1) >> 3; c0 = _mm256_set1_epi32(((int*)tab_coeff_mode_11[j & 0x07])[0]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... 
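/* Second half (pixels 16..31) of this 32-pixel group: the interleaved neighbour pairs are multiplied by the per-row coefficients from tab_coeff_mode_11 (maddubs + hadds), the rounding offset 64 is added and the sum shifted right by 7; D0/D1 are then packed to 8 bit and 32 predicted pixels are stored. */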
t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } void intra_pred_ang_y_25_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { UNUSED_PARAMETER(dir_mode); int i; if (bsx > 8) { ALIGN32(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; pel_t *pfirst = first_line; __m256i coeff0 = _mm256_setr_epi16( 7, 3, 5, 1, 3, 1, 1, 0, 7, 3, 5, 1, 3, 1, 1, 0); __m256i coeff1 = _mm256_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1, 15, 7, 13, 3, 11, 5, 9, 1); __m256i coeff2 = _mm256_setr_epi16( 9, 5, 11, 3, 13, 7, 15, 2, 9, 5, 11, 3, 13, 7, 15, 2); __m256i coeff3 = _mm256_setr_epi16( 1, 1, 3, 1, 5, 3, 7, 1, 1, 1, 3, 1, 5, 3, 7, 1); __m256i coeff4 = _mm256_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 4, 16, 8, 16, 2); __m256i coeff5 = _mm256_setr_epi16( 1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1, 8); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i res1, res2; __m256i L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); __m256i L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]); __m256i L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]); __m256i L3 = _mm256_setr_epi16(src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7]); src -= 4; for (i = 0; i < line_size; i += 64, src -= 4) { p00 = _mm256_mullo_epi16(L0, coeff0);//0...4... p10 = _mm256_mullo_epi16(L1, coeff1);//1...5... p20 = _mm256_mullo_epi16(L2, coeff2);//2...6... p30 = _mm256_mullo_epi16(L3, coeff3);//3...7... p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]);//4 8 p01 = _mm256_mullo_epi16(L1, coeff0);//1...5... p11 = _mm256_mullo_epi16(L2, coeff1);//2...6... p21 = _mm256_mullo_epi16(L3, coeff2);//3...7... p31 = _mm256_mullo_epi16(L0, coeff3);//4...8... p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]);//5 9 p00 = _mm256_mullo_epi16(L2, coeff0);//2...6... p10 = _mm256_mullo_epi16(L3, coeff1);//3...7... p20 = _mm256_mullo_epi16(L0, coeff2);//4...8... p30 = _mm256_mullo_epi16(L1, coeff3);//5...9... 
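/* Remaining output groups use the same 4-tap interpolation, (c0*a + c1*b + c2*c + c3*d + c4) * c5 >> 5, with per-position weights taken from the coeff0..coeff5 vectors. */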
p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]);//6 10 p01 = _mm256_mullo_epi16(L3, coeff0);//3...7... p11 = _mm256_mullo_epi16(L0, coeff1);//4...8... p21 = _mm256_mullo_epi16(L1, coeff2);//5...9... p31 = _mm256_mullo_epi16(L2, coeff3);//6...10... p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res2 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute2x128_si256(res1, res2, 0x0020); _mm256_storeu_si256((__m256i*)pfirst, p00); pfirst += 32; p00 = _mm256_permute2x128_si256(res1, res2, 0x0031); _mm256_storeu_si256((__m256i*)pfirst, p00); pfirst += 32; src -= 4; L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]);//8 12 L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]);//9 13 L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]);//10 14 L3 = _mm256_setr_epi16(src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7]);//11 15 } //if (bsx == 16) {// 8 // __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[7]); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // p00 = _mm256_packus_epi16(p00, p00); // p00 = _mm256_permute4x64_epi64(p00, 0x0008); // _mm256_maskstore_epi64((__m256i*)pfirst, mask, p00); //} else if(bsx == 32){ // __m256i mask = _mm256_set_epi64x(0, -1, -1, -1); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], // src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); // // p01 = _mm256_mullo_epi16(L1, coeff0); // p11 = _mm256_mullo_epi16(L2, coeff1); // p21 = _mm256_mullo_epi16(L3, coeff2); // p31 = _mm256_mullo_epi16(L0, coeff3); // p01 = _mm256_add_epi16(p01, coeff4); // p01 = _mm256_add_epi16(p01, p11); // p01 = _mm256_add_epi16(p01, p21); // p01 = _mm256_add_epi16(p01, p31); // p01 = _mm256_mullo_epi16(p01, coeff5); // p01 = _mm256_srli_epi16(p01, 5); // // p00 = _mm256_packus_epi16(p00, p01); // p00 = _mm256_permute4x64_epi64(p00, 0x00D8); // 
_mm256_maskstore_epi64((__int64*)pfirst, mask, p00); // //} else { // __m256i mask = _mm256_set_epi64x(0, -1, -1, -1); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], // src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); // // p01 = _mm256_mullo_epi16(L1, coeff0); // p11 = _mm256_mullo_epi16(L2, coeff1); // p21 = _mm256_mullo_epi16(L3, coeff2); // p31 = _mm256_mullo_epi16(L0, coeff3); // p01 = _mm256_add_epi16(p01, coeff4); // p01 = _mm256_add_epi16(p01, p11); // p01 = _mm256_add_epi16(p01, p21); // p01 = _mm256_add_epi16(p01, p31); // p01 = _mm256_mullo_epi16(p01, coeff5); // p01 = _mm256_srli_epi16(p01, 5); // // p00 = _mm256_packus_epi16(p00, p01); // p00 = _mm256_permute4x64_epi64(p00, 0x00D8); // _mm256_storeu_si256((__m256*)pfirst, p00); // // pfirst += 32; // // L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], // src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]); // // p00 = _mm256_mullo_epi16(L2, coeff0); // p10 = _mm256_mullo_epi16(L3, coeff1); // p20 = _mm256_mullo_epi16(L0, coeff2); // p30 = _mm256_mullo_epi16(L1, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // p00 = _mm256_packus_epi16(p00, p00); // p00 = _mm256_permute4x64_epi64(p00, 0x0008); // _mm256_maskstore_epi64((__int64*)pfirst, mask, p00); // //} __m256i M; if (bsx == 64) { for (i = 0; i < iHeight8; i += 32){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + +8 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < iHeight8; i += 32){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight8; i += 32){ M = 
_mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else {//8x8 8x32 4x4 4x16 intra_pred_ang_y_25_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return ; } } void intra_pred_ang_y_26_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i shuffle = _mm256_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8, 7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); ALIGN32(pel_t first_line[64 + 256]); int line_size = bsx + (bsy - 1) * 4; int iHeight4 = bsy << 2; src -= 31; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i M1, M2, M3, M4, M5, M6, M7, M8; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < line_size - 64; i += 128, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src)); //15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 S1 = _mm256_loadu_si256((__m256i*)(src - 1));//16 15 14... S2 = _mm256_loadu_si256((__m256i*)(src - 2));//17 16 15... S3 = _mm256_loadu_si256((__m256i*)(src - 3));//18 17 16... L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//15 14 13 12 11 10 9 8 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//16 15 14... L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15... L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0));//18 17 16... H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... 
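/* For every reference pixel, compute the four interpolation phases of this mode: (3,7,5,1)/16, (1,3,3,1)/8, (1,5,7,3)/16 and (1,2,1)/4; the results are packed, byte-reversed with 'shuffle' and interleaved into first_line. */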
p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4);//31...16 p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//15...0 p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4);//31...16 p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4);//15...0 p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M1 = _mm256_packus_epi16(M1, M3); M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle); M5 = _mm256_shuffle_epi8(M5, shuffle); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); //M1 = _mm256_permute4x64_epi64(M1, 0x4E); //M5 = _mm256_permute4x64_epi64(M5, 0x4E); //M2 = _mm256_permute4x64_epi64(M2, 0x4E); //M6 = _mm256_permute4x64_epi64(M6, 0x4E); M1 = _mm256_permute4x64_epi64(M1, 0x72); M5 = _mm256_permute4x64_epi64(M5, 0x72); M2 = _mm256_permute4x64_epi64(M2, 0x72); M6 = _mm256_permute4x64_epi64(M6, 0x72); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M4); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M8); _mm256_storeu_si256((__m256i*)&first_line[64 + i], M3); _mm256_storeu_si256((__m256i*)&first_line[96 + i], M7); } if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src)); //15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 S1 = _mm256_loadu_si256((__m256i*)(src - 1));//16 15 14... S2 = _mm256_loadu_si256((__m256i*)(src - 2));//17 16 15... S3 = _mm256_loadu_si256((__m256i*)(src - 3));//18 17 16... H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... 
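/* Tail of the line: at most 64 entries remain, so only one 16-pixel group (H0..H3) is filtered with the same four phases and written to first_line. */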
p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//15...0 p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4);//15...0 p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); //M2 = _mm256_permute4x64_epi64(M2, 0x4E); //M6 = _mm256_permute4x64_epi64(M6, 0x4E); M2 = _mm256_permute4x64_epi64(M2, 0x72); M6 = _mm256_permute4x64_epi64(M6, 0x72); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M4); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M8); } __m256i M; if (bsx == 64) { for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 4)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 8)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 12)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = 
_mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } /*switch (bsx) { case 4: for (i = 0; i < iHeight4; i += 4) { CP32(dst, first_line + i); dst += i_dst; } break; case 8: for (i = 0; i < iHeight4; i += 4) { CP64(dst, first_line + i); dst += i_dst; } break; default: for (i = 0; i < iHeight4; i += 4) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } break; }*/ } else { //4x4 4x16 intra_pred_ang_y_26_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } } void intra_pred_ang_y_28_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 128]); int line_size = bsx + (bsy - 1) * 2; int i; int iHeight2 = bsy << 1; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i shuffle = _mm256_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8, 7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); src -= 31; __m256i p00, p10; __m256i p01, p11; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < line_size - 32; i += 64, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src)); S3 = _mm256_loadu_si256((__m256i*)(src - 3)); S1 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src - 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//15 14 13 12 11 10 9 8 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//16 15 14... L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15... L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0));//18 17 16... H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... 
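/* Two interpolation phases per reference pixel: (a + 3*b + 3*c + d + 4) >> 3 and (b + 2*c + d + 2) >> 2, computed for the low and high 16-pixel groups, packed, byte-reversed via 'shuffle' and stored into first_line. */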
p00 = _mm256_adds_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3);//031...016 p01 = _mm256_add_epi16(L1, L2); p11 = _mm256_add_epi16(L2, L3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2);//131...116 p00 = _mm256_packus_epi16(p00, p01);// p00 = _mm256_shuffle_epi8(p00, shuffle); p00 = _mm256_permute4x64_epi64(p00, 0x4E); _mm256_storeu_si256((__m256i*)&first_line[i + 32], p00); p00 = _mm256_adds_epi16(H1, H2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_adds_epi16(H0, H3); p10 = _mm256_adds_epi16(p10, coeff4); p00 = _mm256_adds_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_shuffle_epi8(p00, shuffle); p00 = _mm256_permute4x64_epi64(p00, 0x4E); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src)); S3 = _mm256_loadu_si256((__m256i*)(src - 3)); S1 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src - 2)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... p00 = _mm256_adds_epi16(H1, H2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_adds_epi16(H0, H3); p10 = _mm256_adds_epi16(p10, coeff4); p00 = _mm256_adds_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_shuffle_epi8(p00, shuffle); p00 = _mm256_permute4x64_epi64(p00, 0x4E); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } if (bsx == 64){ for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 4] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 6] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); 
_mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } }*/ } void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; UNUSED_PARAMETER(dir_mode); int i; __m256i coeff2 = _mm256_set1_epi16(2); __m256i shuffle = _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __m256i p00, p10; __m256i p01, p11; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; src -= 33; for (i = 0; i < line_size - 16; i += 32, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//35 34 33... 
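/* Each output sample is the (a + 2*b + c + 2) >> 2 low-pass of three neighbouring reference pixels; both 16-pixel halves are filtered, packed and byte-reversed into first_line, which is later copied out with a one-pixel shift per row. */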
L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//34 33 32... L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//20 19 18... H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//19 18 17... H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2);//31...24 23...16 p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2);//15..8 7...0 p00 = _mm256_packus_epi16(p00, p01);//32...24 15...8 23...16 7...0 p00 = _mm256_permute4x64_epi64(p00, 0x8D); p00 = _mm256_shuffle_epi8(p00, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//20 19 18... H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//19 18 17... H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//18 p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2);//15...8..7..0 p01 = _mm256_packus_epi16(p01, p01);//15...8 15...8 7...0 7...0 p01 = _mm256_permute4x64_epi64(p01, 0x0008); p01 = _mm256_shuffle_epi8(p01, shuffle); _mm256_maskstore_epi64((__int64*)&first_line[i], mask, p01); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 1)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 2)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 3)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i 
+ 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx > 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; if (bsy == 4) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[0]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[1]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[2]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[3]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); } else { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[0]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[1]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[2]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[3]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[4]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[5]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[6]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[7]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[8]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[9]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[10]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[11]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[12]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[13]); 
_mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[14]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[15]); _mm256_maskstore_epi64((__int64*)dst1, mask, M); } } else if (bsx == 8) { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < bsy; i += 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } }*/ } void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy){ ALIGN32(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); ALIGN32(pel_t src_tran[MAX_CU_SIZE << 3]); for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++){ src_tran[i] = src[-i]; } intra_pred_ang_x_5_avx(src_tran, dst_tran, bsy, 5, bsy, bsx); for (i = 0; i < bsy; i++){ for (int j = 0; j < bsx; j++){ dst[j + i_dst * i] = dst_tran[i + bsy * j]; } } } else if (bsx == 8){ __m128i coeff0 = _mm_setr_epi16( 5, 1, 7, 1, 1, 3, 3, 1); __m128i coeff1 = _mm_setr_epi16(13, 5, 15, 3, 9, 7, 11, 2); __m128i coeff2 = _mm_setr_epi16(11, 7, 9, 3, 15, 5, 13, 1); __m128i coeff3 = _mm_setr_epi16( 3, 3, 1, 1, 7, 1, 5, 0); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16( 1, 2, 1, 4, 1, 2, 1, 8); __m128i L0, L1, L2, L3; __m128i p00, p10, p20, p30; for (i = 0; i < bsy; i++,src--){ L0 = _mm_setr_epi16(src[-1], src[-2], src[-4], src[-5], src[-6], src[ -8], src[ -9], src[-11]); L1 = _mm_setr_epi16(src[-2], src[-3], src[-5], src[-6], src[-7], src[ -9], src[-10], src[-12]); L2 = _mm_setr_epi16(src[-3], src[-4], src[-6], src[-7], src[-8], src[-10], src[-11], src[-13]); L3 = _mm_setr_epi16(src[-4], src[-5], src[-7], src[-8], src[-9], src[-11], src[-12], src[-14]); p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; } } else { intra_pred_ang_y_31_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 64)]); int line_size = (bsy >> 1) + bsx - 1; int i; int aligned_line_size = ((line_size + 63) >> 4) << 4; pel_t *pfirst[2]; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i shuffle = 
_mm256_setr_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 34; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; __m256i p00, p01, p10, p11; __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < line_size - 8; i += 16, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 S1 = _mm256_loadu_si256((__m256i*)(src)); //18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 S2 = _mm256_loadu_si256((__m256i*)(src + 1));//17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//19 18 17 16 15 14 13 12 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//18 17 16 15 14 13 12 11 L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15 14 13 12 11 10 H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//11 10 9 8 7 6 5 4 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//10 9 8 7 6 5 4 3 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));// 9 8 7 6 5 4 3 2 p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2);//19...12(31...16) p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p10 = _mm256_add_epi16(p10, coeff2); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 2);//11...4(15...0) //31...24 15...8 23...16 7...0 p00 = _mm256_packus_epi16(p00, p10); //19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 p00 = _mm256_permute4x64_epi64(p00, 0x8D);//31...16 15..0 //0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 16.... p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x0D); p00 = _mm256_permute4x64_epi64(p00, 0x08); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p10); } mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[8]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 S1 = _mm256_loadu_si256((__m256i*)(src)); //18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 S2 = _mm256_loadu_si256((__m256i*)(src + 1));//17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//11 10 9 8 7 6 5 4 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//10 9 8 7 6 5 4 3 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));// 9 8 7 6 5 4 3 2 p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p10 = _mm256_add_epi16(p10, coeff2); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 2); //15...8 15...8 7...0 7...0 p00 = _mm256_packus_epi16(p10, p10); //19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 p00 = _mm256_permute4x64_epi64(p00, 0x8D);//15...0 15...0 //0 2 4 6 8 10 12 14 1 3 5 7 1 3 5 7 8.... 
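/* De-interleave the smoothed samples: the shuffle/permute pair separates alternate pixels so that pfirst[0] and pfirst[1] each receive one of the two half-resolution prediction rows. */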
p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x0D); p00 = _mm256_permute4x64_epi64(p00, 0x08); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p10); ; } bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx >= 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else { if (bsy == 4) {//8x8 __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else {//8x32 for (i = 0; i < 16; i = i + 8) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][i]); 
__m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; } } }*/ } void intra_pred_ang_xy_13_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsy > 4) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i coeff9 = _mm256_set1_epi16(9); __m256i coeff11 = _mm256_set1_epi16(11); __m256i coeff13 = _mm256_set1_epi16(13); __m256i coeff15 = _mm256_set1_epi16(15); __m256i coeff16 = _mm256_set1_epi16(16); ALIGN32(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { /* left_size is small here, so there is no need for intrinsics / assembly */ pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src + 1)); S2 = _mm256_loadu_si256((__m256i*)(src)); S3 = _mm256_loadu_si256((__m256i*)(src - 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_mullo_epi16(L0, coeff7); p10 = _mm256_mullo_epi16(L1, coeff15); p20 = _mm256_mullo_epi16(L2, coeff9); p30 = _mm256_add_epi16(L3, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff7); p11 = _mm256_mullo_epi16(H1, coeff15); p21 = _mm256_mullo_epi16(H2, coeff9); p31 = _mm256_add_epi16(H3, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L0, coeff5); p10 = _mm256_mullo_epi16(L1, coeff13); p20 = _mm256_mullo_epi16(L2, coeff11); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff5); p11 = _mm256_mullo_epi16(H1, coeff13); p21 = _mm256_mullo_epi16(H2, coeff11); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(L1, L2); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(H1, H2); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff11); p20 = _mm256_mullo_epi16(L2, coeff13); p30 = _mm256_mullo_epi16(L3, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff11); p21 = _mm256_mullo_epi16(H2, coeff13); p31 = 
_mm256_mullo_epi16(H3, coeff5); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[4][i], p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[5][i], p00); p10 = _mm256_mullo_epi16(L1, coeff9); p20 = _mm256_mullo_epi16(L2, coeff15); p30 = _mm256_mullo_epi16(L3, coeff7); p00 = _mm256_add_epi16(L0, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p11 = _mm256_mullo_epi16(H1, coeff9); p21 = _mm256_mullo_epi16(H2, coeff15); p31 = _mm256_mullo_epi16(H3, coeff7); p01 = _mm256_add_epi16(H0, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[6][i], p00); p10 = _mm256_mullo_epi16(L2, coeff2); p00 = _mm256_add_epi16(L1, L3); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p11 = _mm256_mullo_epi16(H2, coeff2); p01 = _mm256_add_epi16(H1, H3); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[7][i], p00); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[bsx - 1]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src + 1)); S2 = _mm256_loadu_si256((__m256i*)(src)); S3 = _mm256_loadu_si256((__m256i*)(src - 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff7); p10 = _mm256_mullo_epi16(L1, coeff15); p20 = _mm256_mullo_epi16(L2, coeff9); p30 = _mm256_add_epi16(L3, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[0][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); 
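/* Note (illustrative sketch): each pfirst[n] row in this mode is a four-tap weighted
 * average of consecutive reference samples taken at a different fractional offset.
 * For example, the (7,15,9,1) weighting used for pfirst[0] corresponds roughly to the
 * scalar form
 *     out[k] = (pel_t)((7 * a + 15 * b + 9 * c + d + 16) >> 5);
 * where a..d are four neighbouring reference samples; the other weight sets here,
 * (3,7,5,1)+8>>4, (5,13,11,3)+16>>5, (1,3,3,1)+4>>3, and so on, follow the same
 * pattern with their own rounding offsets and shifts. */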
_mm256_maskstore_epi32((int*)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff5); p10 = _mm256_mullo_epi16(L1, coeff13); p20 = _mm256_mullo_epi16(L2, coeff11); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(L1, L2); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[3][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff11); p20 = _mm256_mullo_epi16(L2, coeff13); p30 = _mm256_mullo_epi16(L3, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[4][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[5][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff9); p20 = _mm256_mullo_epi16(L2, coeff15); p30 = _mm256_mullo_epi16(L3, coeff7); p00 = _mm256_add_epi16(L0, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[6][i], mask, p00); p10 = _mm256_mullo_epi16(L2, coeff2); p00 = _mm256_add_epi16(L1, L3); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[7][i], mask, p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 
32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst 
+= i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else { intra_pred_ang_xy_13_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } } void intra_pred_ang_xy_14_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); if (bsy != 4) { ALIGN32(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; __m256i shuffle = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m256i index = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); pel_t *pSrc1 = src; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; __m256i p00, p01, p10, p11; __m256i p20, p30, p21, p31; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < left_size - 1; i += 8, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0 1 2 3 4 5 6 7 8...15 S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...15 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//16...31 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = 
_mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p00 = _mm256_add_epi16(p00, coeff2); p10 = _mm256_add_epi16(p10, coeff2); p00 = _mm256_add_epi16(p00, p01); p10 = _mm256_add_epi16(p10, p11); p00 = _mm256_srli_epi16(p00, 2);//0...7 8...15 p10 = _mm256_srli_epi16(p10, 2);//16...23 24...31 p00 = _mm256_packus_epi16(p00, p10);//0...7 16...23 8...15 24...31 p00 = _mm256_permute4x64_epi64(p00, 0x00D8); //0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15 16 20 24 28 17 21... p10 = _mm256_shuffle_epi8(p00, shuffle); //0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 p10 = _mm256_permutevar8x32_epi32(p10, index); ((__int64*)&pfirst[0][i])[0] = _mm256_extract_epi64(p10, 3); ((__int64*)&pfirst[1][i])[0] = _mm256_extract_epi64(p10, 2); ((__int64*)&pfirst[2][i])[0] = _mm256_extract_epi64(p10, 1); ((__int64*)&pfirst[3][i])[0] = _mm256_extract_epi64(p10, 0); } if (i < left_size) { /* only a few samples remain here, so fall back to the 128-bit (SSE) path instead of AVX */ __m128i shuffle1 = _mm_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 0, 4, 1, 5, 2, 6, 3, 7); __m128i coeff2 = _mm_set1_epi16(2); __m128i zero = _mm_setzero_si128(); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i p00 = _mm_add_epi16(L0, L1); __m128i p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00);//0 1 2 3 4 5 6 7 p00 = _mm_shuffle_epi8(p00, shuffle1);//0 4 1 5 2 6 3 7 ((int*)&pfirst[0][i])[0] = _mm_extract_epi16(p00, 3); ((int*)&pfirst[1][i])[0] = _mm_extract_epi16(p00, 2); ((int*)&pfirst[2][i])[0] = _mm_extract_epi16(p00, 1); ((int*)&pfirst[3][i])[0] = _mm_extract_epi16(p00, 0); } src = pSrc1; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p01 = 
_mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[3][i], mask, p00); } pfirst[0] += 
left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; if (bsx == 64){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else { if (bsx == 16) { __m256i mask = 
_mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m256i p00, p10, p20, p30; __m256i L0, L1, L2, L3; __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst3, mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst2, mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst, mask, p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)dst4, mask, p00); } else {//4x4 pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i zero = _mm_setzero_si128(); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst3)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)dst2)[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = 
_mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)dst4)[0] = _mm_cvtsi128_si32(p00); } } } void intra_pred_ang_xy_16_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + bsy / 2 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i shuffle = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); int i; pel_t *pSrc1; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; pSrc1 = src; __m256i p00, p01, p10, p11; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; __m256i mask1 = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < left_size - 8; i += 16, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));// S2 = _mm256_loadu_si256((__m256i*)(src + 1));// S1 = _mm256_loadu_si256((__m256i*)(src));// L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p00 = _mm256_add_epi16(p00, coeff2); p10 = _mm256_add_epi16(p10, coeff2); p00 = _mm256_add_epi16(p00, p01); p10 = _mm256_add_epi16(p10, p11); p00 = _mm256_srli_epi16(p00, 2);//0 1 2 3 4 5 6 7....15 p10 = _mm256_srli_epi16(p10, 2);//16 17 18....31 //0...7 16...23 8...15 24...31 p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8);//31...16 15..0 //0 1 2 3 p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x08);//0 2 p00 = _mm256_permute4x64_epi64(p00, 0x0D);//1 3 _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask1, p00); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask1, p10); } __m256i mask2 = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[7]); if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2); //0...7 0...7 8...15 8...15 p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008);//0...15 0...15 p01 = _mm256_shuffle_epi8(p00, shuffle);//0 2 4 6 7 8 10 12 14 1 3 5 7 9 11 13 15 p10 = _mm256_permute4x64_epi64(p01, 0x01); 
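/* Note (illustrative): the shuffle above groups the even- and odd-indexed filtered
 * samples so that each group can be written to its own row buffer; the block-copy
 * loops further down then emit two destination rows per step simply by re-reading
 * pfirst[0] / pfirst[1] at a sliding offset. */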
_mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask2, p10); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask2, p01); } src = pSrc1 + left_size + left_size; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3); p10 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H0, H3); p10 = _mm256_mullo_epi16(p10, coeff3); p10 = _mm256_add_epi16(p10, coeff4); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 3); p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p00 = _mm256_add_epi16(p00, coeff2); p10 = _mm256_add_epi16(p10, coeff2); p00 = _mm256_add_epi16(p00, p01); p10 = _mm256_add_epi16(p10, p11); p00 = _mm256_srli_epi16(p00, 2); p10 = _mm256_srli_epi16(p10, 2); p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); } if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[0][i], mask1, p00); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64*)&pfirst[1][i], mask1, p00); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst[0] - i); CP32(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst[0] - i); CP64(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += (i_dst << 1); } break; }*/ } void intra_pred_ang_xy_18_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); src -= bsy - 1; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; __m256i sum1, sum2, sum3, sum4; for (i = 0; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = 
_mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64*)&first_line[i], mask, sum1); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst--; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ M = 
_mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; } } /*switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst--); dst += i_dst; } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst--); dst += i_dst; } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst--, bsx * sizeof(pel_t)); dst += i_dst; } break; break; }*/ } void intra_pred_ang_xy_20_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 128]); int left_size = (bsy - 1) * 2 + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i shuffle = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); pel_t *pSrc1 = src; UNUSED_PARAMETER(dir_mode); src -= bsy; __m256i p00, p01, p10, p11; __m256i p20, p21, p30, p31; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < left_size - 32; i += 64, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0...7 8...15 S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...7 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3);//0...15 p10 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H0, H3); p10 = _mm256_mullo_epi16(p10, coeff3); p10 = _mm256_add_epi16(p10, coeff4); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 3);//16..31 p20 = _mm256_add_epi16(L1, L2); p21 = _mm256_add_epi16(L2, L3); p20 = _mm256_add_epi16(p20, coeff2); p20 = _mm256_add_epi16(p20, p21); p20 = _mm256_srli_epi16(p20, 2);//0...15 p30 = _mm256_add_epi16(H1, H2); p31 = _mm256_add_epi16(H2, H3); p30 = _mm256_add_epi16(p30, coeff2); p30 = _mm256_add_epi16(p30, p31); p30 = _mm256_srli_epi16(p30, 2);//16...31 //00...07 10...17 08...015 18...115 p00 = _mm256_packus_epi16(p00, p20); p10 = _mm256_packus_epi16(p10, p30); p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_shuffle_epi8(p10, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); _mm256_storeu_si256((__m256i*)&first_line[i + 32], p10); } if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0...7 8...15 S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...7 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3);//0...15 p20 = _mm256_add_epi16(L1, L2); p21 = _mm256_add_epi16(L2, L3); p20 = _mm256_add_epi16(p20, coeff2); p20 = _mm256_add_epi16(p20, p21); p20 = _mm256_srli_epi16(p20, 2);//0...15 p00 = _mm256_packus_epi16(p00, p20); p00 = _mm256_shuffle_epi8(p00, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } src = pSrc1; __m256i sum1, sum2, sum3, sum4; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64*)&first_line[i], mask, sum1); } if (bsx == 64){ for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, 
M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 2; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = 
_mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; }*/ } void intra_pred_ang_xy_22_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { src -= bsy; ALIGN32(pel_t first_line[64 + 256]); int left_size = (bsy - 1) * 4 + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; pel_t *pSrc1 = src; __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i shuffle = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i M1, M2, M3, M4, M5, M6, M7, M8; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < line_size - 64; i += 128, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4);//0...15 p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//16...31 p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = 
_mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4); p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M1 = _mm256_packus_epi16(M1, M3);//00...08 10...18 M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle);//00 10 01 11 02 12... M5 = _mm256_shuffle_epi8(M5, shuffle); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); M1 = _mm256_permute4x64_epi64(M1, 0x00D8); M5 = _mm256_permute4x64_epi64(M5, 0x00D8); M2 = _mm256_permute4x64_epi64(M2, 0x00D8); M6 = _mm256_permute4x64_epi64(M6, 0x00D8); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M3); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M7); _mm256_storeu_si256((__m256i*)&first_line[64 + i], M4); _mm256_storeu_si256((__m256i*)&first_line[96 + i], M8); } if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); M1 = _mm256_packus_epi16(M1, M3); M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle); M5 = _mm256_shuffle_epi8(M5, shuffle); M1 = _mm256_permute4x64_epi64(M1, 0x00D8); M5 = _mm256_permute4x64_epi64(M5, 0x00D8); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); _mm256_store_si256((__m256i*)&first_line[i], M3); _mm256_store_si256((__m256i*)&first_line[32 + i], M7); } src = pSrc1 + bsy; __m256i sum1, sum2, sum3, sum4; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64*)&first_line[i], mask, sum1); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; } } else { __m256i mask = 
_mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 4; } } /* switch (bsx) { case 8: while (bsy--) { CP64(dst, pfirst); dst += i_dst; pfirst -= 4; } break; case 16: case 32: case 64: while (bsy--) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } break; default: assert(0); break; }*/ } else {//4x4 4x16 for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); dst[1] = (pel_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); dst[2] = (pel_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); dst[3] = (pel_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); dst += i_dst; } } } void intra_pred_ang_xy_23_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN32(pel_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 7; pel_t *pfirst1 = first_line; pel_t *src_org = src; src -= bsy; __m256i coeff0 = _mm256_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0, 7, 3, 5, 1, 3, 1, 1, 0); __m256i coeff1 = _mm256_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1, 15, 7, 13, 3, 11, 5, 9, 1); __m256i coeff2 = _mm256_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2, 9, 5, 11, 3, 13, 7, 15, 2); __m256i coeff3 = _mm256_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1, 1, 1, 3, 1, 5, 3, 7, 1); __m256i coeff4 = _mm256_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 4, 16, 8, 16, 2); __m256i coeff5 = _mm256_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1, 8); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i res1, res2; __m256i L0, L1, L2, L3; __m256i H0, H1, H2; if (bsy == 4){ L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1]);//-1 3 L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2]);//0 4 L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//1 5 L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//2 6 src += 4; for (i = 0; i < left_size + 1; i += 32) { p00 = _mm256_mullo_epi16(L0, coeff0);//-1 p10 = _mm256_mullo_epi16(L1, coeff1);//0 p20 = _mm256_mullo_epi16(L2, coeff2);//1 p30 = _mm256_mullo_epi16(L3, coeff3);//2 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1]);//-1 3 p01 = _mm256_mullo_epi16(L1, coeff0);//0 p11 = _mm256_mullo_epi16(L2, coeff1);//1 p21 = 
_mm256_mullo_epi16(L3, coeff2);//2 p31 = _mm256_mullo_epi16(L0, coeff3);//3 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); _mm256_storeu_si256((__m256i*)pfirst1, res1); } } else { L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//-1 3 L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//0 4 L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]);//1 5 L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[6], src[6], src[6], src[6], src[6], src[6], src[6], src[6]);//2 6 src += 4; for (i = 0; i < left_size + 1; i += 64, src += 4) { p00 = _mm256_mullo_epi16(L0, coeff0);//-1 3 p10 = _mm256_mullo_epi16(L1, coeff1);// 0 4 p20 = _mm256_mullo_epi16(L2, coeff2);// 1 5 p30 = _mm256_mullo_epi16(L3, coeff3);// 2 6 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//3 7 p01 = _mm256_mullo_epi16(L1, coeff0);//0 4 p11 = _mm256_mullo_epi16(L2, coeff1);//1 5 p21 = _mm256_mullo_epi16(L3, coeff2);//2 6 p31 = _mm256_mullo_epi16(L0, coeff3);//3 7 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//4 8 p00 = _mm256_mullo_epi16(L2, coeff0);//1 5 p10 = _mm256_mullo_epi16(L3, coeff1);//2 6 p20 = _mm256_mullo_epi16(L0, coeff2);//3 7 p30 = _mm256_mullo_epi16(L1, coeff3);//4 8 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]);//5 9 p01 = _mm256_mullo_epi16(L3, coeff0);//2 6 p11 = _mm256_mullo_epi16(L0, coeff1);//3 7 p21 = _mm256_mullo_epi16(L1, coeff2);//4 8 p31 = _mm256_mullo_epi16(L2, coeff3);//5 9 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res2 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute2x128_si256(res1, res2, 0x0020); _mm256_storeu_si256((__m256i*)pfirst1, p00); pfirst1 += 32; p00 = _mm256_permute2x128_si256(res1, res2, 0x0031); _mm256_storeu_si256((__m256i*)pfirst1, p00); pfirst1 += 32; src += 4; L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[3], src[3], src[3], src[3], src[3], src[3], 
src[3], src[3]); L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]); L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]); L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[6], src[6], src[6], src[6], src[6], src[6], src[6], src[6]); } } src = src_org + 1; __m256i S0, S1, S2; coeff2 = _mm256_set1_epi16(2); for (; i < line_size; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 1)); S2 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); __m256i L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); __m256i L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_mullo_epi16(L0, coeff2); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_mullo_epi16(H0, coeff2); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64*)dst, mask, M); dst += i_dst; pfirst -= 8; } } 
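/* Note: the commented-out loop that follows is the scalar reference for this copy stage of intra_pred_ang_xy_23_avx: each output row copies bsx pixels from the prepared first_line buffer and steps pfirst back by 8 samples per row, exactly what the unrolled 256-bit (masked) stores above do. */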
/*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 8; }*/ } else {//8x8 8x32 4x4 4x16------128bit is enough intra_pred_ang_xy_23_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } } xavs2-1.3/source/common/vec/intrinsic_mad.c000066400000000000000000005121761340660520300207660ustar00rootroot00000000000000/* * intrinsic_mad.c * * Description of this file: * SSE assembly functions of MAD-Calculating module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../basic_types.h" #include "intrinsic.h" #include <mmintrin.h> #include <emmintrin.h> #include <tmmintrin.h> #include <smmintrin.h> /* --------------------------------------------------------------------------- */ int mad_16x16_sse128(pel_t *p_src, int i_src, int cu_size) { __m128i zero; __m128i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15; __m128i T0_0, T1_0, T2_0, T3_0, T4_0, T5_0, T6_0, T7_0, T8_0, T9_0, T10_0, T11_0, T12_0, T13_0, T14_0, T15_0; __m128i T0_1, T1_1, T2_1, T3_1, T4_1, T5_1, T6_1, T7_1, T8_1, T9_1, T10_1, T11_1, T12_1, T13_1, T14_1, T15_1; __m128i S; __m128i avg; __m128i M; int num_pix = cu_size * cu_size; int sum = 0; int f_avg = 0; /* average of all pixels in current block */ int mad = 0; /* cal average */ zero = _mm_set1_epi8(0); T0 = _mm_loadu_si128((__m128i *)p_src); T0_0 = _mm_unpacklo_epi8(T0, zero); T0_1 = _mm_unpackhi_epi8(T0, zero); T0 = _mm_add_epi16(T0_0, T0_1); T1 = _mm_loadu_si128((__m128i *)(p_src + i_src)); T1_0 = _mm_unpacklo_epi8(T1, zero); T1_1 = _mm_unpackhi_epi8(T1, zero); T1 = _mm_add_epi16(T1_0, T1_1); T2 = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src)); T2_0 = _mm_unpacklo_epi8(T2, zero); T2_1 = _mm_unpackhi_epi8(T2, zero); T2 = _mm_add_epi16(T2_0, T2_1); T3 = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src)); T3_0 = _mm_unpacklo_epi8(T3, zero); T3_1 = _mm_unpackhi_epi8(T3, zero); T3 = _mm_add_epi16(T3_0, T3_1); T4 = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src)); T4_0 = _mm_unpacklo_epi8(T4, zero); T4_1 = _mm_unpackhi_epi8(T4, zero); T4 = _mm_add_epi16(T4_0, T4_1); T5 = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src)); T5_0 = _mm_unpacklo_epi8(T5, zero); T5_1 = _mm_unpackhi_epi8(T5, zero); T5 = _mm_add_epi16(T5_0, T5_1); T6 = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src)); T6_0 = _mm_unpacklo_epi8(T6, zero); T6_1 = _mm_unpackhi_epi8(T6, zero); T6 = _mm_add_epi16(T6_0, T6_1); T7 = _mm_loadu_si128((__m128i *)(p_src + 7 * 
i_src)); T7_0 = _mm_unpacklo_epi8(T7, zero); T7_1 = _mm_unpackhi_epi8(T7, zero); T7 = _mm_add_epi16(T7_0, T7_1); T8 = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src)); T8_0 = _mm_unpacklo_epi8(T8, zero); T8_1 = _mm_unpackhi_epi8(T8, zero); T8 = _mm_add_epi16(T8_0, T8_1); T9 = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src)); T9_0 = _mm_unpacklo_epi8(T9, zero); T9_1 = _mm_unpackhi_epi8(T9, zero); T9 = _mm_add_epi16(T9_0, T9_1); T10 = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src)); T10_0 = _mm_unpacklo_epi8(T10, zero); T10_1 = _mm_unpackhi_epi8(T10, zero); T10 = _mm_add_epi16(T10_0, T10_1); T11 = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src)); T11_0 = _mm_unpacklo_epi8(T11, zero); T11_1 = _mm_unpackhi_epi8(T11, zero); T11 = _mm_add_epi16(T11_0, T11_1); T12 = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src)); T12_0 = _mm_unpacklo_epi8(T12, zero); T12_1 = _mm_unpackhi_epi8(T12, zero); T12 = _mm_add_epi16(T12_0, T12_1); T13 = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src)); T13_0 = _mm_unpacklo_epi8(T13, zero); T13_1 = _mm_unpackhi_epi8(T13, zero); T13 = _mm_add_epi16(T13_0, T13_1); T14 = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src)); T14_0 = _mm_unpacklo_epi8(T14, zero); T14_1 = _mm_unpackhi_epi8(T14, zero); T14 = _mm_add_epi16(T14_0, T14_1); T15 = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src)); T15_0 = _mm_unpacklo_epi8(T15, zero); T15_1 = _mm_unpackhi_epi8(T15, zero); T15 = _mm_add_epi16(T15_0, T15_1); S = _mm_add_epi16(T0, T1); S = _mm_add_epi16(S, T2); S = _mm_add_epi16(S, T3); S = _mm_add_epi16(S, T4); S = _mm_add_epi16(S, T5); S = _mm_add_epi16(S, T6); S = _mm_add_epi16(S, T7); S = _mm_add_epi16(S, T8); S = _mm_add_epi16(S, T9); S = _mm_add_epi16(S, T10); S = _mm_add_epi16(S, T11); S = _mm_add_epi16(S, T12); S = _mm_add_epi16(S, T13); S = _mm_add_epi16(S, T14); S = _mm_add_epi16(S, T15); sum = M128_I16(S, 0) + M128_I16(S, 1) + M128_I16(S, 2) + M128_I16(S, 3) + M128_I16(S, 4) + M128_I16(S, 5) + M128_I16(S, 6) + M128_I16(S, 7); f_avg = (sum + (num_pix >> 1)) / num_pix; avg = _mm_set1_epi16((short)f_avg); /* cal mad */ T0_0 = _mm_sub_epi16(T0_0, avg); T0_1 = _mm_sub_epi16(T0_1, avg); T1_0 = _mm_sub_epi16(T1_0, avg); T1_1 = _mm_sub_epi16(T1_1, avg); T2_0 = _mm_sub_epi16(T2_0, avg); T2_1 = _mm_sub_epi16(T2_1, avg); T3_0 = _mm_sub_epi16(T3_0, avg); T3_1 = _mm_sub_epi16(T3_1, avg); T4_0 = _mm_sub_epi16(T4_0, avg); T4_1 = _mm_sub_epi16(T4_1, avg); T5_0 = _mm_sub_epi16(T5_0, avg); T5_1 = _mm_sub_epi16(T5_1, avg); T6_0 = _mm_sub_epi16(T6_0, avg); T6_1 = _mm_sub_epi16(T6_1, avg); T7_0 = _mm_sub_epi16(T7_0, avg); T7_1 = _mm_sub_epi16(T7_1, avg); T8_0 = _mm_sub_epi16(T8_0, avg); T8_1 = _mm_sub_epi16(T8_1, avg); T9_0 = _mm_sub_epi16(T9_0, avg); T9_1 = _mm_sub_epi16(T9_1, avg); T10_0 = _mm_sub_epi16(T10_0, avg); T10_1 = _mm_sub_epi16(T10_1, avg); T11_0 = _mm_sub_epi16(T11_0, avg); T11_1 = _mm_sub_epi16(T11_1, avg); T12_0 = _mm_sub_epi16(T12_0, avg); T12_1 = _mm_sub_epi16(T12_1, avg); T13_0 = _mm_sub_epi16(T13_0, avg); T13_1 = _mm_sub_epi16(T13_1, avg); T14_0 = _mm_sub_epi16(T14_0, avg); T14_1 = _mm_sub_epi16(T14_1, avg); T15_0 = _mm_sub_epi16(T15_0, avg); T15_1 = _mm_sub_epi16(T15_1, avg); T0_0 = _mm_abs_epi16(T0_0); T0_1 = _mm_abs_epi16(T0_1); T1_0 = _mm_abs_epi16(T1_0); T1_1 = _mm_abs_epi16(T1_1); T2_0 = _mm_abs_epi16(T2_0); T2_1 = _mm_abs_epi16(T2_1); T3_0 = _mm_abs_epi16(T3_0); T3_1 = _mm_abs_epi16(T3_1); T4_0 = _mm_abs_epi16(T4_0); T4_1 = _mm_abs_epi16(T4_1); T5_0 = _mm_abs_epi16(T5_0); T5_1 = _mm_abs_epi16(T5_1); T6_0 = _mm_abs_epi16(T6_0); T6_1 = _mm_abs_epi16(T6_1); T7_0 = 
_mm_abs_epi16(T7_0); T7_1 = _mm_abs_epi16(T7_1); T8_0 = _mm_abs_epi16(T8_0); T8_1 = _mm_abs_epi16(T8_1); T9_0 = _mm_abs_epi16(T9_0); T9_1 = _mm_abs_epi16(T9_1); T10_0 = _mm_abs_epi16(T10_0); T10_1 = _mm_abs_epi16(T10_1); T11_0 = _mm_abs_epi16(T11_0); T11_1 = _mm_abs_epi16(T11_1); T12_0 = _mm_abs_epi16(T12_0); T12_1 = _mm_abs_epi16(T12_1); T13_0 = _mm_abs_epi16(T13_0); T13_1 = _mm_abs_epi16(T13_1); T14_0 = _mm_abs_epi16(T14_0); T14_1 = _mm_abs_epi16(T14_1); T15_0 = _mm_abs_epi16(T15_0); T15_1 = _mm_abs_epi16(T15_1); T0 = _mm_add_epi16(T0_0, T0_1); T1 = _mm_add_epi16(T1_0, T1_1); T2 = _mm_add_epi16(T2_0, T2_1); T3 = _mm_add_epi16(T3_0, T3_1); T4 = _mm_add_epi16(T4_0, T4_1); T5 = _mm_add_epi16(T5_0, T5_1); T6 = _mm_add_epi16(T6_0, T6_1); T7 = _mm_add_epi16(T7_0, T7_1); T8 = _mm_add_epi16(T8_0, T8_1); T9 = _mm_add_epi16(T9_0, T9_1); T10 = _mm_add_epi16(T10_0, T10_1); T11 = _mm_add_epi16(T11_0, T11_1); T12 = _mm_add_epi16(T12_0, T12_1); T13 = _mm_add_epi16(T13_0, T13_1); T14 = _mm_add_epi16(T14_0, T14_1); T15 = _mm_add_epi16(T15_0, T15_1); M = _mm_add_epi16(T0, T1); M = _mm_add_epi16(M, T2); M = _mm_add_epi16(M, T3); M = _mm_add_epi16(M, T4); M = _mm_add_epi16(M, T5); M = _mm_add_epi16(M, T6); M = _mm_add_epi16(M, T7); M = _mm_add_epi16(M, T8); M = _mm_add_epi16(M, T9); M = _mm_add_epi16(M, T10); M = _mm_add_epi16(M, T11); M = _mm_add_epi16(M, T12); M = _mm_add_epi16(M, T13); M = _mm_add_epi16(M, T14); M = _mm_add_epi16(M, T15); mad = M128_U16(M, 0) + M128_U16(M, 1) + M128_U16(M, 2) + M128_U16(M, 3) + M128_U16(M, 4) + M128_U16(M, 5) + M128_U16(M, 6) + M128_U16(M, 7); return mad; } /* --------------------------------------------------------------------------- */ int mad_32x32_sse128(pel_t *p_src, int i_src, int cu_size) { __m128i zero; __m128i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, T31; __m128i T0A, T1A, T2A, T3A, T4A, T5A, T6A, T7A, T8A, T9A, T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A, T18A, T19A, T20A, T21A, T22A, T23A, T24A, T25A, T26A, T27A, T28A, T29A, T30A, T31A; __m128i T0B, T1B, T2B, T3B, T4B, T5B, T6B, T7B, T8B, T9B, T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B, T18B, T19B, T20B, T21B, T22B, T23B, T24B, T25B, T26B, T27B, T28B, T29B, T30B, T31B; __m128i T0_0A, T1_0A, T2_0A, T3_0A, T4_0A, T5_0A, T6_0A, T7_0A, T8_0A, T9_0A, T10_0A, T11_0A, T12_0A, T13_0A, T14_0A, T15_0A, T16_0A, T17_0A, T18_0A, T19_0A, T20_0A, T21_0A, T22_0A, T23_0A, T24_0A, T25_0A, T26_0A, T27_0A, T28_0A, T29_0A, T30_0A, T31_0A; __m128i T0_1A, T1_1A, T2_1A, T3_1A, T4_1A, T5_1A, T6_1A, T7_1A, T8_1A, T9_1A, T10_1A, T11_1A, T12_1A, T13_1A, T14_1A, T15_1A, T16_1A, T17_1A, T18_1A, T19_1A, T20_1A, T21_1A, T22_1A, T23_1A, T24_1A, T25_1A, T26_1A, T27_1A, T28_1A, T29_1A, T30_1A, T31_1A; __m128i T0_0B, T1_0B, T2_0B, T3_0B, T4_0B, T5_0B, T6_0B, T7_0B, T8_0B, T9_0B, T10_0B, T11_0B, T12_0B, T13_0B, T14_0B, T15_0B, T16_0B, T17_0B, T18_0B, T19_0B, T20_0B, T21_0B, T22_0B, T23_0B, T24_0B, T25_0B, T26_0B, T27_0B, T28_0B, T29_0B, T30_0B, T31_0B; __m128i T0_1B, T1_1B, T2_1B, T3_1B, T4_1B, T5_1B, T6_1B, T7_1B, T8_1B, T9_1B, T10_1B, T11_1B, T12_1B, T13_1B, T14_1B, T15_1B, T16_1B, T17_1B, T18_1B, T19_1B, T20_1B, T21_1B, T22_1B, T23_1B, T24_1B, T25_1B, T26_1B, T27_1B, T28_1B, T29_1B, T30_1B, T31_1B; __m128i S; __m128i avg; __m128i M; int num_pix = cu_size * cu_size; int sum = 0; int f_avg = 0; /* average of all pixels in current block */ int mad = 0; /* cal average */ zero = _mm_set1_epi8(0); T0A = _mm_loadu_si128((__m128i *)p_src); 
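/* For reference, a scalar sketch of what this routine computes (illustration only, not part of the build):
 *     sum = 0;
 *     for (y = 0; y < cu_size; y++)
 *         for (x = 0; x < cu_size; x++)
 *             sum += p_src[y * i_src + x];
 *     f_avg = (sum + (num_pix >> 1)) / num_pix;
 *     mad = 0;
 *     for (y = 0; y < cu_size; y++)
 *         for (x = 0; x < cu_size; x++)
 *             mad += abs(p_src[y * i_src + x] - f_avg);
 * The unrolled intrinsics keep every row in registers: each 32-pixel row is loaded as two
 * 16-byte halves (suffix A/B), widened to 16-bit lanes, and reduced into per-row sums that
 * S accumulates for the block average; the same widened rows are then reused for the
 * absolute-difference pass. */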
T0_0A = _mm_unpacklo_epi8(T0A, zero); T0_1A = _mm_unpackhi_epi8(T0A, zero); T0A = _mm_add_epi16(T0_0A, T0_1A); T0B = _mm_loadu_si128((__m128i *)(p_src + 16)); T0_0B = _mm_unpacklo_epi8(T0B, zero); T0_1B = _mm_unpackhi_epi8(T0B, zero); T0B = _mm_add_epi16(T0_0B, T0_1B); T0 = _mm_add_epi16(T0A, T0B); T1A = _mm_loadu_si128((__m128i *)(p_src + i_src)); T1_0A = _mm_unpacklo_epi8(T1A, zero); T1_1A = _mm_unpackhi_epi8(T1A, zero); T1A = _mm_add_epi16(T1_0A, T1_1A); T1B = _mm_loadu_si128((__m128i *)(p_src + i_src + 16)); T1_0B = _mm_unpacklo_epi8(T1B, zero); T1_1B = _mm_unpackhi_epi8(T1B, zero); T1B = _mm_add_epi16(T1_0B, T1_1B); T1 = _mm_add_epi16(T1A, T1B); T2A = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src)); T2_0A = _mm_unpacklo_epi8(T2A, zero); T2_1A = _mm_unpackhi_epi8(T2A, zero); T2A = _mm_add_epi16(T2_0A, T2_1A); T2B = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 16)); T2_0B = _mm_unpacklo_epi8(T2B, zero); T2_1B = _mm_unpackhi_epi8(T2B, zero); T2B = _mm_add_epi16(T2_0B, T2_1B); T2 = _mm_add_epi16(T2A, T2B); T3A = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src)); T3_0A = _mm_unpacklo_epi8(T3A, zero); T3_1A = _mm_unpackhi_epi8(T3A, zero); T3A = _mm_add_epi16(T3_0A, T3_1A); T3B = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 16)); T3_0B = _mm_unpacklo_epi8(T3B, zero); T3_1B = _mm_unpackhi_epi8(T3B, zero); T3B = _mm_add_epi16(T3_0B, T3_1B); T3 = _mm_add_epi16(T3A, T3B); T4A = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src)); T4_0A = _mm_unpacklo_epi8(T4A, zero); T4_1A = _mm_unpackhi_epi8(T4A, zero); T4A = _mm_add_epi16(T4_0A, T4_1A); T4B = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 16)); T4_0B = _mm_unpacklo_epi8(T4B, zero); T4_1B = _mm_unpackhi_epi8(T4B, zero); T4B = _mm_add_epi16(T4_0B, T4_1B); T4 = _mm_add_epi16(T4A, T4B); T5A = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src)); T5_0A = _mm_unpacklo_epi8(T5A, zero); T5_1A = _mm_unpackhi_epi8(T5A, zero); T5A = _mm_add_epi16(T5_0A, T5_1A); T5B = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 16)); T5_0B = _mm_unpacklo_epi8(T5B, zero); T5_1B = _mm_unpackhi_epi8(T5B, zero); T5B = _mm_add_epi16(T5_0B, T5_1B); T5 = _mm_add_epi16(T5A, T5B); T6A = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src)); T6_0A = _mm_unpacklo_epi8(T6A, zero); T6_1A = _mm_unpackhi_epi8(T6A, zero); T6A = _mm_add_epi16(T6_0A, T6_1A); T6B = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 16)); T6_0B = _mm_unpacklo_epi8(T6B, zero); T6_1B = _mm_unpackhi_epi8(T6B, zero); T6B = _mm_add_epi16(T6_0B, T6_1B); T6 = _mm_add_epi16(T6A, T6B); T7A = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src)); T7_0A = _mm_unpacklo_epi8(T7A, zero); T7_1A = _mm_unpackhi_epi8(T7A, zero); T7A = _mm_add_epi16(T7_0A, T7_1A); T7B = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 16)); T7_0B = _mm_unpacklo_epi8(T7B, zero); T7_1B = _mm_unpackhi_epi8(T7B, zero); T7B = _mm_add_epi16(T7_0B, T7_1B); T7 = _mm_add_epi16(T7A, T7B); T8A = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src)); T8_0A = _mm_unpacklo_epi8(T8A, zero); T8_1A = _mm_unpackhi_epi8(T8A, zero); T8A = _mm_add_epi16(T8_0A, T8_1A); T8B = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 16)); T8_0B = _mm_unpacklo_epi8(T8B, zero); T8_1B = _mm_unpackhi_epi8(T8B, zero); T8B = _mm_add_epi16(T8_0B, T8_1B); T8 = _mm_add_epi16(T8A, T8B); T9A = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src)); T9_0A = _mm_unpacklo_epi8(T9A, zero); T9_1A = _mm_unpackhi_epi8(T9A, zero); T9A = _mm_add_epi16(T9_0A, T9_1A); T9B = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 16)); T9_0B = _mm_unpacklo_epi8(T9B, zero); T9_1B = _mm_unpackhi_epi8(T9B, zero); T9B = 
_mm_add_epi16(T9_0B, T9_1B); T9 = _mm_add_epi16(T9A, T9B); T10A = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src)); T10_0A = _mm_unpacklo_epi8(T10A, zero); T10_1A = _mm_unpackhi_epi8(T10A, zero); T10A = _mm_add_epi16(T10_0A, T10_1A); T10B = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 16)); T10_0B = _mm_unpacklo_epi8(T10B, zero); T10_1B = _mm_unpackhi_epi8(T10B, zero); T10B = _mm_add_epi16(T10_0B, T10_1B); T10 = _mm_add_epi16(T10A, T10B); T11A = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src)); T11_0A = _mm_unpacklo_epi8(T11A, zero); T11_1A = _mm_unpackhi_epi8(T11A, zero); T11A = _mm_add_epi16(T11_0A, T11_1A); T11B = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 16)); T11_0B = _mm_unpacklo_epi8(T11B, zero); T11_1B = _mm_unpackhi_epi8(T11B, zero); T11B = _mm_add_epi16(T11_0B, T11_1B); T11 = _mm_add_epi16(T11A, T11B); T12A = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src)); T12_0A = _mm_unpacklo_epi8(T12A, zero); T12_1A = _mm_unpackhi_epi8(T12A, zero); T12A = _mm_add_epi16(T12_0A, T12_1A); T12B = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 16)); T12_0B = _mm_unpacklo_epi8(T12B, zero); T12_1B = _mm_unpackhi_epi8(T12B, zero); T12B = _mm_add_epi16(T12_0B, T12_1B); T12 = _mm_add_epi16(T12A, T12B); T13A = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src)); T13_0A = _mm_unpacklo_epi8(T13A, zero); T13_1A = _mm_unpackhi_epi8(T13A, zero); T13A = _mm_add_epi16(T13_0A, T13_1A); T13B = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 16)); T13_0B = _mm_unpacklo_epi8(T13B, zero); T13_1B = _mm_unpackhi_epi8(T13B, zero); T13B = _mm_add_epi16(T13_0B, T13_1B); T13 = _mm_add_epi16(T13A, T13B); T14A = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src)); T14_0A = _mm_unpacklo_epi8(T14A, zero); T14_1A = _mm_unpackhi_epi8(T14A, zero); T14A = _mm_add_epi16(T14_0A, T14_1A); T14B = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 16)); T14_0B = _mm_unpacklo_epi8(T14B, zero); T14_1B = _mm_unpackhi_epi8(T14B, zero); T14B = _mm_add_epi16(T14_0B, T14_1B); T14 = _mm_add_epi16(T14A, T14B); T15A = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src)); T15_0A = _mm_unpacklo_epi8(T15A, zero); T15_1A = _mm_unpackhi_epi8(T15A, zero); T15A = _mm_add_epi16(T15_0A, T15_1A); T15B = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 16)); T15_0B = _mm_unpacklo_epi8(T15B, zero); T15_1B = _mm_unpackhi_epi8(T15B, zero); T15B = _mm_add_epi16(T15_0B, T15_1B); T15 = _mm_add_epi16(T15A, T15B); T16A = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src)); T16_0A = _mm_unpacklo_epi8(T16A, zero); T16_1A = _mm_unpackhi_epi8(T16A, zero); T16A = _mm_add_epi16(T16_0A, T16_1A); T16B = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 16)); T16_0B = _mm_unpacklo_epi8(T16B, zero); T16_1B = _mm_unpackhi_epi8(T16B, zero); T16B = _mm_add_epi16(T16_0B, T16_1B); T16 = _mm_add_epi16(T16A, T16B); T17A = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src)); T17_0A = _mm_unpacklo_epi8(T17A, zero); T17_1A = _mm_unpackhi_epi8(T17A, zero); T17A = _mm_add_epi16(T17_0A, T17_1A); T17B = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 16)); T17_0B = _mm_unpacklo_epi8(T17B, zero); T17_1B = _mm_unpackhi_epi8(T17B, zero); T17B = _mm_add_epi16(T17_0B, T17_1B); T17 = _mm_add_epi16(T17A, T17B); T18A = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src)); T18_0A = _mm_unpacklo_epi8(T18A, zero); T18_1A = _mm_unpackhi_epi8(T18A, zero); T18A = _mm_add_epi16(T18_0A, T18_1A); T18B = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 16)); T18_0B = _mm_unpacklo_epi8(T18B, zero); T18_1B = _mm_unpackhi_epi8(T18B, zero); T18B = _mm_add_epi16(T18_0B, T18_1B); T18 = _mm_add_epi16(T18A, T18B); 
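/* rows 19..31 of the 32x32 block follow the same load / widen / per-row-sum pattern as the rows above */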
T19A = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src)); T19_0A = _mm_unpacklo_epi8(T19A, zero); T19_1A = _mm_unpackhi_epi8(T19A, zero); T19A = _mm_add_epi16(T19_0A, T19_1A); T19B = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 16)); T19_0B = _mm_unpacklo_epi8(T19B, zero); T19_1B = _mm_unpackhi_epi8(T19B, zero); T19B = _mm_add_epi16(T19_0B, T19_1B); T19 = _mm_add_epi16(T19A, T19B); T20A = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src)); T20_0A = _mm_unpacklo_epi8(T20A, zero); T20_1A = _mm_unpackhi_epi8(T20A, zero); T20A = _mm_add_epi16(T20_0A, T20_1A); T20B = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 16)); T20_0B = _mm_unpacklo_epi8(T20B, zero); T20_1B = _mm_unpackhi_epi8(T20B, zero); T20B = _mm_add_epi16(T20_0B, T20_1B); T20 = _mm_add_epi16(T20A, T20B); T21A = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src)); T21_0A = _mm_unpacklo_epi8(T21A, zero); T21_1A = _mm_unpackhi_epi8(T21A, zero); T21A = _mm_add_epi16(T21_0A, T21_1A); T21B = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 16)); T21_0B = _mm_unpacklo_epi8(T21B, zero); T21_1B = _mm_unpackhi_epi8(T21B, zero); T21B = _mm_add_epi16(T21_0B, T21_1B); T21 = _mm_add_epi16(T21A, T21B); T22A = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src)); T22_0A = _mm_unpacklo_epi8(T22A, zero); T22_1A = _mm_unpackhi_epi8(T22A, zero); T22A = _mm_add_epi16(T22_0A, T22_1A); T22B = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 16)); T22_0B = _mm_unpacklo_epi8(T22B, zero); T22_1B = _mm_unpackhi_epi8(T22B, zero); T22B = _mm_add_epi16(T22_0B, T22_1B); T22 = _mm_add_epi16(T22A, T22B); T23A = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src)); T23_0A = _mm_unpacklo_epi8(T23A, zero); T23_1A = _mm_unpackhi_epi8(T23A, zero); T23A = _mm_add_epi16(T23_0A, T23_1A); T23B = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 16)); T23_0B = _mm_unpacklo_epi8(T23B, zero); T23_1B = _mm_unpackhi_epi8(T23B, zero); T23B = _mm_add_epi16(T23_0B, T23_1B); T23 = _mm_add_epi16(T23A, T23B); T24A = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src)); T24_0A = _mm_unpacklo_epi8(T24A, zero); T24_1A = _mm_unpackhi_epi8(T24A, zero); T24A = _mm_add_epi16(T24_0A, T24_1A); T24B = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 16)); T24_0B = _mm_unpacklo_epi8(T24B, zero); T24_1B = _mm_unpackhi_epi8(T24B, zero); T24B = _mm_add_epi16(T24_0B, T24_1B); T24 = _mm_add_epi16(T24A, T24B); T25A = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src)); T25_0A = _mm_unpacklo_epi8(T25A, zero); T25_1A = _mm_unpackhi_epi8(T25A, zero); T25A = _mm_add_epi16(T25_0A, T25_1A); T25B = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 16)); T25_0B = _mm_unpacklo_epi8(T25B, zero); T25_1B = _mm_unpackhi_epi8(T25B, zero); T25B = _mm_add_epi16(T25_0B, T25_1B); T25 = _mm_add_epi16(T25A, T25B); T26A = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src)); T26_0A = _mm_unpacklo_epi8(T26A, zero); T26_1A = _mm_unpackhi_epi8(T26A, zero); T26A = _mm_add_epi16(T26_0A, T26_1A); T26B = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 16)); T26_0B = _mm_unpacklo_epi8(T26B, zero); T26_1B = _mm_unpackhi_epi8(T26B, zero); T26B = _mm_add_epi16(T26_0B, T26_1B); T26 = _mm_add_epi16(T26A, T26B); T27A = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src)); T27_0A = _mm_unpacklo_epi8(T27A, zero); T27_1A = _mm_unpackhi_epi8(T27A, zero); T27A = _mm_add_epi16(T27_0A, T27_1A); T27B = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 16)); T27_0B = _mm_unpacklo_epi8(T27B, zero); T27_1B = _mm_unpackhi_epi8(T27B, zero); T27B = _mm_add_epi16(T27_0B, T27_1B); T27 = _mm_add_epi16(T27A, T27B); T28A = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src)); 
T28_0A = _mm_unpacklo_epi8(T28A, zero); T28_1A = _mm_unpackhi_epi8(T28A, zero); T28A = _mm_add_epi16(T28_0A, T28_1A); T28B = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 16)); T28_0B = _mm_unpacklo_epi8(T28B, zero); T28_1B = _mm_unpackhi_epi8(T28B, zero); T28B = _mm_add_epi16(T28_0B, T28_1B); T28 = _mm_add_epi16(T28A, T28B); T29A = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src)); T29_0A = _mm_unpacklo_epi8(T29A, zero); T29_1A = _mm_unpackhi_epi8(T29A, zero); T29A = _mm_add_epi16(T29_0A, T29_1A); T29B = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 16)); T29_0B = _mm_unpacklo_epi8(T29B, zero); T29_1B = _mm_unpackhi_epi8(T29B, zero); T29B = _mm_add_epi16(T29_0B, T29_1B); T29 = _mm_add_epi16(T29A, T29B); T30A = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src)); T30_0A = _mm_unpacklo_epi8(T30A, zero); T30_1A = _mm_unpackhi_epi8(T30A, zero); T30A = _mm_add_epi16(T30_0A, T30_1A); T30B = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 16)); T30_0B = _mm_unpacklo_epi8(T30B, zero); T30_1B = _mm_unpackhi_epi8(T30B, zero); T30B = _mm_add_epi16(T30_0B, T30_1B); T30 = _mm_add_epi16(T30A, T30B); T31A = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src)); T31_0A = _mm_unpacklo_epi8(T31A, zero); T31_1A = _mm_unpackhi_epi8(T31A, zero); T31A = _mm_add_epi16(T31_0A, T31_1A); T31B = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 16)); T31_0B = _mm_unpacklo_epi8(T31B, zero); T31_1B = _mm_unpackhi_epi8(T31B, zero); T31B = _mm_add_epi16(T31_0B, T31_1B); T31 = _mm_add_epi16(T31A, T31B); S = _mm_add_epi16(T0, T1); S = _mm_add_epi16(S, T2); S = _mm_add_epi16(S, T3); S = _mm_add_epi16(S, T4); S = _mm_add_epi16(S, T5); S = _mm_add_epi16(S, T6); S = _mm_add_epi16(S, T7); S = _mm_add_epi16(S, T8); S = _mm_add_epi16(S, T9); S = _mm_add_epi16(S, T10); S = _mm_add_epi16(S, T11); S = _mm_add_epi16(S, T12); S = _mm_add_epi16(S, T13); S = _mm_add_epi16(S, T14); S = _mm_add_epi16(S, T15); S = _mm_add_epi16(S, T16); S = _mm_add_epi16(S, T17); S = _mm_add_epi16(S, T18); S = _mm_add_epi16(S, T19); S = _mm_add_epi16(S, T20); S = _mm_add_epi16(S, T21); S = _mm_add_epi16(S, T22); S = _mm_add_epi16(S, T23); S = _mm_add_epi16(S, T24); S = _mm_add_epi16(S, T25); S = _mm_add_epi16(S, T26); S = _mm_add_epi16(S, T27); S = _mm_add_epi16(S, T28); S = _mm_add_epi16(S, T29); S = _mm_add_epi16(S, T30); S = _mm_add_epi16(S, T31); sum = M128_I16(S, 0) + M128_I16(S, 1) + M128_I16(S, 2) + M128_I16(S, 3) + M128_I16(S, 4) + M128_I16(S, 5) + M128_I16(S, 6) + M128_I16(S, 7); f_avg = (sum + (num_pix >> 1)) / num_pix; avg = _mm_set1_epi16((short)f_avg); /* cal mad */ T0_0A = _mm_sub_epi16(T0_0A, avg); T0_1A = _mm_sub_epi16(T0_1A, avg); T0_0B = _mm_sub_epi16(T0_0B, avg); T0_1B = _mm_sub_epi16(T0_1B, avg); T1_0A = _mm_sub_epi16(T1_0A, avg); T1_1A = _mm_sub_epi16(T1_1A, avg); T1_0B = _mm_sub_epi16(T1_0B, avg); T1_1B = _mm_sub_epi16(T1_1B, avg); T2_0A = _mm_sub_epi16(T2_0A, avg); T2_1A = _mm_sub_epi16(T2_1A, avg); T2_0B = _mm_sub_epi16(T2_0B, avg); T2_1B = _mm_sub_epi16(T2_1B, avg); T3_0A = _mm_sub_epi16(T3_0A, avg); T3_1A = _mm_sub_epi16(T3_1A, avg); T3_0B = _mm_sub_epi16(T3_0B, avg); T3_1B = _mm_sub_epi16(T3_1B, avg); T4_0A = _mm_sub_epi16(T4_0A, avg); T4_1A = _mm_sub_epi16(T4_1A, avg); T4_0B = _mm_sub_epi16(T4_0B, avg); T4_1B = _mm_sub_epi16(T4_1B, avg); T5_0A = _mm_sub_epi16(T5_0A, avg); T5_1A = _mm_sub_epi16(T5_1A, avg); T5_0B = _mm_sub_epi16(T5_0B, avg); T5_1B = _mm_sub_epi16(T5_1B, avg); T6_0A = _mm_sub_epi16(T6_0A, avg); T6_1A = _mm_sub_epi16(T6_1A, avg); T6_0B = _mm_sub_epi16(T6_0B, avg); T6_1B = _mm_sub_epi16(T6_1B, avg); T7_0A = 
_mm_sub_epi16(T7_0A, avg); T7_1A = _mm_sub_epi16(T7_1A, avg); T7_0B = _mm_sub_epi16(T7_0B, avg); T7_1B = _mm_sub_epi16(T7_1B, avg); T8_0A = _mm_sub_epi16(T8_0A, avg); T8_1A = _mm_sub_epi16(T8_1A, avg); T8_0B = _mm_sub_epi16(T8_0B, avg); T8_1B = _mm_sub_epi16(T8_1B, avg); T9_0A = _mm_sub_epi16(T9_0A, avg); T9_1A = _mm_sub_epi16(T9_1A, avg); T9_0B = _mm_sub_epi16(T9_0B, avg); T9_1B = _mm_sub_epi16(T9_1B, avg); T10_0A = _mm_sub_epi16(T10_0A, avg); T10_1A = _mm_sub_epi16(T10_1A, avg); T10_0B = _mm_sub_epi16(T10_0B, avg); T10_1B = _mm_sub_epi16(T10_1B, avg); T11_0A = _mm_sub_epi16(T11_0A, avg); T11_1A = _mm_sub_epi16(T11_1A, avg); T11_0B = _mm_sub_epi16(T11_0B, avg); T11_1B = _mm_sub_epi16(T11_1B, avg); T12_0A = _mm_sub_epi16(T12_0A, avg); T12_1A = _mm_sub_epi16(T12_1A, avg); T12_0B = _mm_sub_epi16(T12_0B, avg); T12_1B = _mm_sub_epi16(T12_1B, avg); T13_0A = _mm_sub_epi16(T13_0A, avg); T13_1A = _mm_sub_epi16(T13_1A, avg); T13_0B = _mm_sub_epi16(T13_0B, avg); T13_1B = _mm_sub_epi16(T13_1B, avg); T14_0A = _mm_sub_epi16(T14_0A, avg); T14_1A = _mm_sub_epi16(T14_1A, avg); T14_0B = _mm_sub_epi16(T14_0B, avg); T14_1B = _mm_sub_epi16(T14_1B, avg); T15_0A = _mm_sub_epi16(T15_0A, avg); T15_1A = _mm_sub_epi16(T15_1A, avg); T15_0B = _mm_sub_epi16(T15_0B, avg); T15_1B = _mm_sub_epi16(T15_1B, avg); T16_0A = _mm_sub_epi16(T16_0A, avg); T16_1A = _mm_sub_epi16(T16_1A, avg); T16_0B = _mm_sub_epi16(T16_0B, avg); T16_1B = _mm_sub_epi16(T16_1B, avg); T17_0A = _mm_sub_epi16(T17_0A, avg); T17_1A = _mm_sub_epi16(T17_1A, avg); T17_0B = _mm_sub_epi16(T17_0B, avg); T17_1B = _mm_sub_epi16(T17_1B, avg); T18_0A = _mm_sub_epi16(T18_0A, avg); T18_1A = _mm_sub_epi16(T18_1A, avg); T18_0B = _mm_sub_epi16(T18_0B, avg); T18_1B = _mm_sub_epi16(T18_1B, avg); T19_0A = _mm_sub_epi16(T19_0A, avg); T19_1A = _mm_sub_epi16(T19_1A, avg); T19_0B = _mm_sub_epi16(T19_0B, avg); T19_1B = _mm_sub_epi16(T19_1B, avg); T20_0A = _mm_sub_epi16(T20_0A, avg); T20_1A = _mm_sub_epi16(T20_1A, avg); T20_0B = _mm_sub_epi16(T20_0B, avg); T20_1B = _mm_sub_epi16(T20_1B, avg); T21_0A = _mm_sub_epi16(T21_0A, avg); T21_1A = _mm_sub_epi16(T21_1A, avg); T21_0B = _mm_sub_epi16(T21_0B, avg); T21_1B = _mm_sub_epi16(T21_1B, avg); T22_0A = _mm_sub_epi16(T22_0A, avg); T22_1A = _mm_sub_epi16(T22_1A, avg); T22_0B = _mm_sub_epi16(T22_0B, avg); T22_1B = _mm_sub_epi16(T22_1B, avg); T23_0A = _mm_sub_epi16(T23_0A, avg); T23_1A = _mm_sub_epi16(T23_1A, avg); T23_0B = _mm_sub_epi16(T23_0B, avg); T23_1B = _mm_sub_epi16(T23_1B, avg); T24_0A = _mm_sub_epi16(T24_0A, avg); T24_1A = _mm_sub_epi16(T24_1A, avg); T24_0B = _mm_sub_epi16(T24_0B, avg); T24_1B = _mm_sub_epi16(T24_1B, avg); T25_0A = _mm_sub_epi16(T25_0A, avg); T25_1A = _mm_sub_epi16(T25_1A, avg); T25_0B = _mm_sub_epi16(T25_0B, avg); T25_1B = _mm_sub_epi16(T25_1B, avg); T26_0A = _mm_sub_epi16(T26_0A, avg); T26_1A = _mm_sub_epi16(T26_1A, avg); T26_0B = _mm_sub_epi16(T26_0B, avg); T26_1B = _mm_sub_epi16(T26_1B, avg); T27_0A = _mm_sub_epi16(T27_0A, avg); T27_1A = _mm_sub_epi16(T27_1A, avg); T27_0B = _mm_sub_epi16(T27_0B, avg); T27_1B = _mm_sub_epi16(T27_1B, avg); T28_0A = _mm_sub_epi16(T28_0A, avg); T28_1A = _mm_sub_epi16(T28_1A, avg); T28_0B = _mm_sub_epi16(T28_0B, avg); T28_1B = _mm_sub_epi16(T28_1B, avg); T29_0A = _mm_sub_epi16(T29_0A, avg); T29_1A = _mm_sub_epi16(T29_1A, avg); T29_0B = _mm_sub_epi16(T29_0B, avg); T29_1B = _mm_sub_epi16(T29_1B, avg); T30_0A = _mm_sub_epi16(T30_0A, avg); T30_1A = _mm_sub_epi16(T30_1A, avg); T30_0B = _mm_sub_epi16(T30_0B, avg); T30_1B = _mm_sub_epi16(T30_1B, avg); T31_0A = _mm_sub_epi16(T31_0A, 
avg); T31_1A = _mm_sub_epi16(T31_1A, avg); T31_0B = _mm_sub_epi16(T31_0B, avg); T31_1B = _mm_sub_epi16(T31_1B, avg); T0_0A = _mm_abs_epi16(T0_0A); T0_1A = _mm_abs_epi16(T0_1A); T0_0B = _mm_abs_epi16(T0_0B); T0_1B = _mm_abs_epi16(T0_1B); T1_0A = _mm_abs_epi16(T1_0A); T1_1A = _mm_abs_epi16(T1_1A); T1_0B = _mm_abs_epi16(T1_0B); T1_1B = _mm_abs_epi16(T1_1B); T2_0A = _mm_abs_epi16(T2_0A); T2_1A = _mm_abs_epi16(T2_1A); T2_0B = _mm_abs_epi16(T2_0B); T2_1B = _mm_abs_epi16(T2_1B); T3_0A = _mm_abs_epi16(T3_0A); T3_1A = _mm_abs_epi16(T3_1A); T3_0B = _mm_abs_epi16(T3_0B); T3_1B = _mm_abs_epi16(T3_1B); T4_0A = _mm_abs_epi16(T4_0A); T4_1A = _mm_abs_epi16(T4_1A); T4_0B = _mm_abs_epi16(T4_0B); T4_1B = _mm_abs_epi16(T4_1B); T5_0A = _mm_abs_epi16(T5_0A); T5_1A = _mm_abs_epi16(T5_1A); T5_0B = _mm_abs_epi16(T5_0B); T5_1B = _mm_abs_epi16(T5_1B); T6_0A = _mm_abs_epi16(T6_0A); T6_1A = _mm_abs_epi16(T6_1A); T6_0B = _mm_abs_epi16(T6_0B); T6_1B = _mm_abs_epi16(T6_1B); T7_0A = _mm_abs_epi16(T7_0A); T7_1A = _mm_abs_epi16(T7_1A); T7_0B = _mm_abs_epi16(T7_0B); T7_1B = _mm_abs_epi16(T7_1B); T8_0A = _mm_abs_epi16(T8_0A); T8_1A = _mm_abs_epi16(T8_1A); T8_0B = _mm_abs_epi16(T8_0B); T8_1B = _mm_abs_epi16(T8_1B); T9_0A = _mm_abs_epi16(T9_0A); T9_1A = _mm_abs_epi16(T9_1A); T9_0B = _mm_abs_epi16(T9_0B); T9_1B = _mm_abs_epi16(T9_1B); T10_0A = _mm_abs_epi16(T10_0A); T10_1A = _mm_abs_epi16(T10_1A); T10_0B = _mm_abs_epi16(T10_0B); T10_1B = _mm_abs_epi16(T10_1B); T11_0A = _mm_abs_epi16(T11_0A); T11_1A = _mm_abs_epi16(T11_1A); T11_0B = _mm_abs_epi16(T11_0B); T11_1B = _mm_abs_epi16(T11_1B); T12_0A = _mm_abs_epi16(T12_0A); T12_1A = _mm_abs_epi16(T12_1A); T12_0B = _mm_abs_epi16(T12_0B); T12_1B = _mm_abs_epi16(T12_1B); T13_0A = _mm_abs_epi16(T13_0A); T13_1A = _mm_abs_epi16(T13_1A); T13_0B = _mm_abs_epi16(T13_0B); T13_1B = _mm_abs_epi16(T13_1B); T14_0A = _mm_abs_epi16(T14_0A); T14_1A = _mm_abs_epi16(T14_1A); T14_0B = _mm_abs_epi16(T14_0B); T14_1B = _mm_abs_epi16(T14_1B); T15_0A = _mm_abs_epi16(T15_0A); T15_1A = _mm_abs_epi16(T15_1A); T15_0B = _mm_abs_epi16(T15_0B); T15_1B = _mm_abs_epi16(T15_1B); T16_0A = _mm_abs_epi16(T16_0A); T16_1A = _mm_abs_epi16(T16_1A); T16_0B = _mm_abs_epi16(T16_0B); T16_1B = _mm_abs_epi16(T16_1B); T17_0A = _mm_abs_epi16(T17_0A); T17_1A = _mm_abs_epi16(T17_1A); T17_0B = _mm_abs_epi16(T17_0B); T17_1B = _mm_abs_epi16(T17_1B); T18_0A = _mm_abs_epi16(T18_0A); T18_1A = _mm_abs_epi16(T18_1A); T18_0B = _mm_abs_epi16(T18_0B); T18_1B = _mm_abs_epi16(T18_1B); T19_0A = _mm_abs_epi16(T19_0A); T19_1A = _mm_abs_epi16(T19_1A); T19_0B = _mm_abs_epi16(T19_0B); T19_1B = _mm_abs_epi16(T19_1B); T20_0A = _mm_abs_epi16(T20_0A); T20_1A = _mm_abs_epi16(T20_1A); T20_0B = _mm_abs_epi16(T20_0B); T20_1B = _mm_abs_epi16(T20_1B); T21_0A = _mm_abs_epi16(T21_0A); T21_1A = _mm_abs_epi16(T21_1A); T21_0B = _mm_abs_epi16(T21_0B); T21_1B = _mm_abs_epi16(T21_1B); T22_0A = _mm_abs_epi16(T22_0A); T22_1A = _mm_abs_epi16(T22_1A); T22_0B = _mm_abs_epi16(T22_0B); T22_1B = _mm_abs_epi16(T22_1B); T23_0A = _mm_abs_epi16(T23_0A); T23_1A = _mm_abs_epi16(T23_1A); T23_0B = _mm_abs_epi16(T23_0B); T23_1B = _mm_abs_epi16(T23_1B); T24_0A = _mm_abs_epi16(T24_0A); T24_1A = _mm_abs_epi16(T24_1A); T24_0B = _mm_abs_epi16(T24_0B); T24_1B = _mm_abs_epi16(T24_1B); T25_0A = _mm_abs_epi16(T25_0A); T25_1A = _mm_abs_epi16(T25_1A); T25_0B = _mm_abs_epi16(T25_0B); T25_1B = _mm_abs_epi16(T25_1B); T26_0A = _mm_abs_epi16(T26_0A); T26_1A = _mm_abs_epi16(T26_1A); T26_0B = _mm_abs_epi16(T26_0B); T26_1B = _mm_abs_epi16(T26_1B); T27_0A = _mm_abs_epi16(T27_0A); T27_1A = 
_mm_abs_epi16(T27_1A); T27_0B = _mm_abs_epi16(T27_0B); T27_1B = _mm_abs_epi16(T27_1B); T28_0A = _mm_abs_epi16(T28_0A); T28_1A = _mm_abs_epi16(T28_1A); T28_0B = _mm_abs_epi16(T28_0B); T28_1B = _mm_abs_epi16(T28_1B); T29_0A = _mm_abs_epi16(T29_0A); T29_1A = _mm_abs_epi16(T29_1A); T29_0B = _mm_abs_epi16(T29_0B); T29_1B = _mm_abs_epi16(T29_1B); T30_0A = _mm_abs_epi16(T30_0A); T30_1A = _mm_abs_epi16(T30_1A); T30_0B = _mm_abs_epi16(T30_0B); T30_1B = _mm_abs_epi16(T30_1B); T31_0A = _mm_abs_epi16(T31_0A); T31_1A = _mm_abs_epi16(T31_1A); T31_0B = _mm_abs_epi16(T31_0B); T31_1B = _mm_abs_epi16(T31_1B); T0 = _mm_add_epi16(T0_0A, T0_1A); T0 = _mm_add_epi16(T0, T0_0B); T0 = _mm_add_epi16(T0, T0_1B); T1 = _mm_add_epi16(T1_0A, T1_1A); T1 = _mm_add_epi16(T1, T1_0B); T1 = _mm_add_epi16(T1, T1_1B); T2 = _mm_add_epi16(T2_0A, T2_1A); T2 = _mm_add_epi16(T2, T2_0B); T2 = _mm_add_epi16(T2, T2_1B); T3 = _mm_add_epi16(T3_0A, T3_1A); T3 = _mm_add_epi16(T3, T3_0B); T3 = _mm_add_epi16(T3, T3_1B); T4 = _mm_add_epi16(T4_0A, T4_1A); T4 = _mm_add_epi16(T4, T4_0B); T4 = _mm_add_epi16(T4, T4_1B); T5 = _mm_add_epi16(T5_0A, T5_1A); T5 = _mm_add_epi16(T5, T5_0B); T5 = _mm_add_epi16(T5, T5_1B); T6 = _mm_add_epi16(T6_0A, T6_1A); T6 = _mm_add_epi16(T6, T6_0B); T6 = _mm_add_epi16(T6, T6_1B); T7 = _mm_add_epi16(T7_0A, T7_1A); T7 = _mm_add_epi16(T7, T7_0B); T7 = _mm_add_epi16(T7, T7_1B); T8 = _mm_add_epi16(T8_0A, T8_1A); T8 = _mm_add_epi16(T8, T8_0B); T8 = _mm_add_epi16(T8, T8_1B); T9 = _mm_add_epi16(T9_0A, T9_1A); T9 = _mm_add_epi16(T9, T9_0B); T9 = _mm_add_epi16(T9, T9_1B); T10 = _mm_add_epi16(T10_0A, T10_1A); T10 = _mm_add_epi16(T10, T10_0B); T10 = _mm_add_epi16(T10, T10_1B); T11 = _mm_add_epi16(T11_0A, T11_1A); T11 = _mm_add_epi16(T11, T11_0B); T11 = _mm_add_epi16(T11, T11_1B); T12 = _mm_add_epi16(T12_0A, T12_1A); T12 = _mm_add_epi16(T12, T12_0B); T12 = _mm_add_epi16(T12, T12_1B); T13 = _mm_add_epi16(T13_0A, T13_1A); T13 = _mm_add_epi16(T13, T13_0B); T13 = _mm_add_epi16(T13, T13_1B); T14 = _mm_add_epi16(T14_0A, T14_1A); T14 = _mm_add_epi16(T14, T14_0B); T14 = _mm_add_epi16(T14, T14_1B); T15 = _mm_add_epi16(T15_0A, T15_1A); T15 = _mm_add_epi16(T15, T15_0B); T15 = _mm_add_epi16(T15, T15_1B); T16 = _mm_add_epi16(T16_0A, T16_1A); T16 = _mm_add_epi16(T16, T16_0B); T16 = _mm_add_epi16(T16, T16_1B); T17 = _mm_add_epi16(T17_0A, T17_1A); T17 = _mm_add_epi16(T17, T17_0B); T17 = _mm_add_epi16(T17, T17_1B); T18 = _mm_add_epi16(T18_0A, T18_1A); T18 = _mm_add_epi16(T18, T18_0B); T18 = _mm_add_epi16(T18, T18_1B); T19 = _mm_add_epi16(T19_0A, T19_1A); T19 = _mm_add_epi16(T19, T19_0B); T19 = _mm_add_epi16(T19, T19_1B); T20 = _mm_add_epi16(T20_0A, T20_1A); T20 = _mm_add_epi16(T20, T20_0B); T20 = _mm_add_epi16(T20, T20_1B); T21 = _mm_add_epi16(T21_0A, T21_1A); T21 = _mm_add_epi16(T21, T21_0B); T21 = _mm_add_epi16(T21, T21_1B); T22 = _mm_add_epi16(T22_0A, T22_1A); T22 = _mm_add_epi16(T22, T22_0B); T22 = _mm_add_epi16(T22, T22_1B); T23 = _mm_add_epi16(T23_0A, T23_1A); T23 = _mm_add_epi16(T23, T23_0B); T23 = _mm_add_epi16(T23, T23_1B); T24 = _mm_add_epi16(T24_0A, T24_1A); T24 = _mm_add_epi16(T24, T24_0B); T24 = _mm_add_epi16(T24, T24_1B); T25 = _mm_add_epi16(T25_0A, T25_1A); T25 = _mm_add_epi16(T25, T25_0B); T25 = _mm_add_epi16(T25, T25_1B); T26 = _mm_add_epi16(T26_0A, T26_1A); T26 = _mm_add_epi16(T26, T26_0B); T26 = _mm_add_epi16(T26, T26_1B); T27 = _mm_add_epi16(T27_0A, T27_1A); T27 = _mm_add_epi16(T27, T27_0B); T27 = _mm_add_epi16(T27, T27_1B); T28 = _mm_add_epi16(T28_0A, T28_1A); T28 = _mm_add_epi16(T28, T28_0B); T28 = _mm_add_epi16(T28, T28_1B); 
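/* fold the per-row absolute-difference totals T0..T31 into M, then sum its eight 16-bit lanes to obtain the scalar MAD */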
T29 = _mm_add_epi16(T29_0A, T29_1A); T29 = _mm_add_epi16(T29, T29_0B); T29 = _mm_add_epi16(T29, T29_1B); T30 = _mm_add_epi16(T30_0A, T30_1A); T30 = _mm_add_epi16(T30, T30_0B); T30 = _mm_add_epi16(T30, T30_1B); T31 = _mm_add_epi16(T31_0A, T31_1A); T31 = _mm_add_epi16(T31, T31_0B); T31 = _mm_add_epi16(T31, T31_1B); M = _mm_add_epi16(T0, T1); M = _mm_add_epi16(M, T2); M = _mm_add_epi16(M, T3); M = _mm_add_epi16(M, T4); M = _mm_add_epi16(M, T5); M = _mm_add_epi16(M, T6); M = _mm_add_epi16(M, T7); M = _mm_add_epi16(M, T8); M = _mm_add_epi16(M, T9); M = _mm_add_epi16(M, T10); M = _mm_add_epi16(M, T11); M = _mm_add_epi16(M, T12); M = _mm_add_epi16(M, T13); M = _mm_add_epi16(M, T14); M = _mm_add_epi16(M, T15); M = _mm_add_epi16(M, T16); M = _mm_add_epi16(M, T17); M = _mm_add_epi16(M, T18); M = _mm_add_epi16(M, T19); M = _mm_add_epi16(M, T20); M = _mm_add_epi16(M, T21); M = _mm_add_epi16(M, T22); M = _mm_add_epi16(M, T23); M = _mm_add_epi16(M, T24); M = _mm_add_epi16(M, T25); M = _mm_add_epi16(M, T26); M = _mm_add_epi16(M, T27); M = _mm_add_epi16(M, T28); M = _mm_add_epi16(M, T29); M = _mm_add_epi16(M, T30); M = _mm_add_epi16(M, T31); mad = M128_I16(M, 0) + M128_I16(M, 1) + M128_I16(M, 2) + M128_I16(M, 3) + M128_I16(M, 4) + M128_I16(M, 5) + M128_I16(M, 6) + M128_I16(M, 7); return mad; } /* --------------------------------------------------------------------------- */ int mad_64x64_sse128(pel_t *p_src, int i_src, int cu_size) { __m128i zero; __m128i T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50, T51, T52, T53, T54, T55, T56, T57, T58, T59, T60, T61, T62, T63; __m128i T0A, T1A, T2A, T3A, T4A, T5A, T6A, T7A, T8A, T9A, T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A, T18A, T19A, T20A, T21A, T22A, T23A, T24A, T25A, T26A, T27A, T28A, T29A, T30A, T31A, T32A, T33A, T34A, T35A, T36A, T37A, T38A, T39A, T40A, T41A, T42A, T43A, T44A, T45A, T46A, T47A, T48A, T49A, T50A, T51A, T52A, T53A, T54A, T55A, T56A, T57A, T58A, T59A, T60A, T61A, T62A, T63A; __m128i T0B, T1B, T2B, T3B, T4B, T5B, T6B, T7B, T8B, T9B, T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B, T18B, T19B, T20B, T21B, T22B, T23B, T24B, T25B, T26B, T27B, T28B, T29B, T30B, T31B, T32B, T33B, T34B, T35B, T36B, T37B, T38B, T39B, T40B, T41B, T42B, T43B, T44B, T45B, T46B, T47B, T48B, T49B, T50B, T51B, T52B, T53B, T54B, T55B, T56B, T57B, T58B, T59B, T60B, T61B, T62B, T63B; __m128i T0C, T1C, T2C, T3C, T4C, T5C, T6C, T7C, T8C, T9C, T10C, T11C, T12C, T13C, T14C, T15C, T16C, T17C, T18C, T19C, T20C, T21C, T22C, T23C, T24C, T25C, T26C, T27C, T28C, T29C, T30C, T31C, T32C, T33C, T34C, T35C, T36C, T37C, T38C, T39C, T40C, T41C, T42C, T43C, T44C, T45C, T46C, T47C, T48C, T49C, T50C, T51C, T52C, T53C, T54C, T55C, T56C, T57C, T58C, T59C, T60C, T61C, T62C, T63C; __m128i T0D, T1D, T2D, T3D, T4D, T5D, T6D, T7D, T8D, T9D, T10D, T11D, T12D, T13D, T14D, T15D, T16D, T17D, T18D, T19D, T20D, T21D, T22D, T23D, T24D, T25D, T26D, T27D, T28D, T29D, T30D, T31D, T32D, T33D, T34D, T35D, T36D, T37D, T38D, T39D, T40D, T41D, T42D, T43D, T44D, T45D, T46D, T47D, T48D, T49D, T50D, T51D, T52D, T53D, T54D, T55D, T56D, T57D, T58D, T59D, T60D, T61D, T62D, T63D; __m128i T0_0A, T1_0A, T2_0A, T3_0A, T4_0A, T5_0A, T6_0A, T7_0A, T8_0A, T9_0A, T10_0A, T11_0A, T12_0A, T13_0A, T14_0A, T15_0A, T16_0A, T17_0A, T18_0A, T19_0A, T20_0A, T21_0A, T22_0A, T23_0A, T24_0A, T25_0A, T26_0A, T27_0A, T28_0A, T29_0A, T30_0A, 
T31_0A, T32_0A, T33_0A, T34_0A, T35_0A, T36_0A, T37_0A, T38_0A, T39_0A, T40_0A, T41_0A, T42_0A, T43_0A, T44_0A, T45_0A, T46_0A, T47_0A, T48_0A, T49_0A, T50_0A, T51_0A, T52_0A, T53_0A, T54_0A, T55_0A, T56_0A, T57_0A, T58_0A, T59_0A, T60_0A, T61_0A, T62_0A, T63_0A; __m128i T0_1A, T1_1A, T2_1A, T3_1A, T4_1A, T5_1A, T6_1A, T7_1A, T8_1A, T9_1A, T10_1A, T11_1A, T12_1A, T13_1A, T14_1A, T15_1A, T16_1A, T17_1A, T18_1A, T19_1A, T20_1A, T21_1A, T22_1A, T23_1A, T24_1A, T25_1A, T26_1A, T27_1A, T28_1A, T29_1A, T30_1A, T31_1A, T32_1A, T33_1A, T34_1A, T35_1A, T36_1A, T37_1A, T38_1A, T39_1A, T40_1A, T41_1A, T42_1A, T43_1A, T44_1A, T45_1A, T46_1A, T47_1A, T48_1A, T49_1A, T50_1A, T51_1A, T52_1A, T53_1A, T54_1A, T55_1A, T56_1A, T57_1A, T58_1A, T59_1A, T60_1A, T61_1A, T62_1A, T63_1A; __m128i T0_0B, T1_0B, T2_0B, T3_0B, T4_0B, T5_0B, T6_0B, T7_0B, T8_0B, T9_0B, T10_0B, T11_0B, T12_0B, T13_0B, T14_0B, T15_0B, T16_0B, T17_0B, T18_0B, T19_0B, T20_0B, T21_0B, T22_0B, T23_0B, T24_0B, T25_0B, T26_0B, T27_0B, T28_0B, T29_0B, T30_0B, T31_0B, T32_0B, T33_0B, T34_0B, T35_0B, T36_0B, T37_0B, T38_0B, T39_0B, T40_0B, T41_0B, T42_0B, T43_0B, T44_0B, T45_0B, T46_0B, T47_0B, T48_0B, T49_0B, T50_0B, T51_0B, T52_0B, T53_0B, T54_0B, T55_0B, T56_0B, T57_0B, T58_0B, T59_0B, T60_0B, T61_0B, T62_0B, T63_0B; __m128i T0_1B, T1_1B, T2_1B, T3_1B, T4_1B, T5_1B, T6_1B, T7_1B, T8_1B, T9_1B, T10_1B, T11_1B, T12_1B, T13_1B, T14_1B, T15_1B, T16_1B, T17_1B, T18_1B, T19_1B, T20_1B, T21_1B, T22_1B, T23_1B, T24_1B, T25_1B, T26_1B, T27_1B, T28_1B, T29_1B, T30_1B, T31_1B, T32_1B, T33_1B, T34_1B, T35_1B, T36_1B, T37_1B, T38_1B, T39_1B, T40_1B, T41_1B, T42_1B, T43_1B, T44_1B, T45_1B, T46_1B, T47_1B, T48_1B, T49_1B, T50_1B, T51_1B, T52_1B, T53_1B, T54_1B, T55_1B, T56_1B, T57_1B, T58_1B, T59_1B, T60_1B, T61_1B, T62_1B, T63_1B; __m128i T0_0C, T1_0C, T2_0C, T3_0C, T4_0C, T5_0C, T6_0C, T7_0C, T8_0C, T9_0C, T10_0C, T11_0C, T12_0C, T13_0C, T14_0C, T15_0C, T16_0C, T17_0C, T18_0C, T19_0C, T20_0C, T21_0C, T22_0C, T23_0C, T24_0C, T25_0C, T26_0C, T27_0C, T28_0C, T29_0C, T30_0C, T31_0C, T32_0C, T33_0C, T34_0C, T35_0C, T36_0C, T37_0C, T38_0C, T39_0C, T40_0C, T41_0C, T42_0C, T43_0C, T44_0C, T45_0C, T46_0C, T47_0C, T48_0C, T49_0C, T50_0C, T51_0C, T52_0C, T53_0C, T54_0C, T55_0C, T56_0C, T57_0C, T58_0C, T59_0C, T60_0C, T61_0C, T62_0C, T63_0C; __m128i T0_1C, T1_1C, T2_1C, T3_1C, T4_1C, T5_1C, T6_1C, T7_1C, T8_1C, T9_1C, T10_1C, T11_1C, T12_1C, T13_1C, T14_1C, T15_1C, T16_1C, T17_1C, T18_1C, T19_1C, T20_1C, T21_1C, T22_1C, T23_1C, T24_1C, T25_1C, T26_1C, T27_1C, T28_1C, T29_1C, T30_1C, T31_1C, T32_1C, T33_1C, T34_1C, T35_1C, T36_1C, T37_1C, T38_1C, T39_1C, T40_1C, T41_1C, T42_1C, T43_1C, T44_1C, T45_1C, T46_1C, T47_1C, T48_1C, T49_1C, T50_1C, T51_1C, T52_1C, T53_1C, T54_1C, T55_1C, T56_1C, T57_1C, T58_1C, T59_1C, T60_1C, T61_1C, T62_1C, T63_1C; __m128i T0_0D, T1_0D, T2_0D, T3_0D, T4_0D, T5_0D, T6_0D, T7_0D, T8_0D, T9_0D, T10_0D, T11_0D, T12_0D, T13_0D, T14_0D, T15_0D, T16_0D, T17_0D, T18_0D, T19_0D, T20_0D, T21_0D, T22_0D, T23_0D, T24_0D, T25_0D, T26_0D, T27_0D, T28_0D, T29_0D, T30_0D, T31_0D, T32_0D, T33_0D, T34_0D, T35_0D, T36_0D, T37_0D, T38_0D, T39_0D, T40_0D, T41_0D, T42_0D, T43_0D, T44_0D, T45_0D, T46_0D, T47_0D, T48_0D, T49_0D, T50_0D, T51_0D, T52_0D, T53_0D, T54_0D, T55_0D, T56_0D, T57_0D, T58_0D, T59_0D, T60_0D, T61_0D, T62_0D, T63_0D; __m128i T0_1D, T1_1D, T2_1D, T3_1D, T4_1D, T5_1D, T6_1D, T7_1D, T8_1D, T9_1D, T10_1D, T11_1D, T12_1D, T13_1D, T14_1D, T15_1D, T16_1D, T17_1D, T18_1D, T19_1D, T20_1D, T21_1D, T22_1D, T23_1D, T24_1D, T25_1D, T26_1D, T27_1D, T28_1D, 
T29_1D, T30_1D, T31_1D, T32_1D, T33_1D, T34_1D, T35_1D, T36_1D, T37_1D, T38_1D, T39_1D, T40_1D, T41_1D, T42_1D, T43_1D, T44_1D, T45_1D, T46_1D, T47_1D, T48_1D, T49_1D, T50_1D, T51_1D, T52_1D, T53_1D, T54_1D, T55_1D, T56_1D, T57_1D, T58_1D, T59_1D, T60_1D, T61_1D, T62_1D, T63_1D; __m128i S1, S2, S3, S; __m128i avg; __m128i M1, M2, M; int sum1, sum2, sum3; int mad1, mad2, mads; int num_pix = cu_size * cu_size; int sum = 0; int f_avg = 0; /* average of all pixels in current block */ int mad = 0; /* cal average */ /*for (int y = 0; y < cu_size; ++y) { int sum_row = 0; for (int x = 0; x < cu_size; ++x) { sum_row += p_src[x]; } sum += sum_row; p_src += i_src; } f_avg = sum / num_pix;*/ zero = _mm_set1_epi8(0); T0A = _mm_loadu_si128((__m128i *)p_src); T0_0A = _mm_unpacklo_epi8(T0A, zero); T0_1A = _mm_unpackhi_epi8(T0A, zero); T0A = _mm_add_epi16(T0_0A, T0_1A); T0B = _mm_loadu_si128((__m128i *)(p_src + 16)); T0_0B = _mm_unpacklo_epi8(T0B, zero); T0_1B = _mm_unpackhi_epi8(T0B, zero); T0B = _mm_add_epi16(T0_0B, T0_1B); T0C = _mm_loadu_si128((__m128i *)(p_src + 32)); T0_0C = _mm_unpacklo_epi8(T0C, zero); T0_1C = _mm_unpackhi_epi8(T0C, zero); T0C = _mm_add_epi16(T0_0C, T0_1C); T0D = _mm_loadu_si128((__m128i *)(p_src + 48)); T0_0D = _mm_unpacklo_epi8(T0D, zero); T0_1D = _mm_unpackhi_epi8(T0D, zero); T0D = _mm_add_epi16(T0_0D, T0_1D); T0 = _mm_add_epi16(T0A, T0B); T0 = _mm_add_epi16(T0, T0C); T0 = _mm_add_epi16(T0, T0D); T1A = _mm_loadu_si128((__m128i *)(p_src + i_src)); T1_0A = _mm_unpacklo_epi8(T1A, zero); T1_1A = _mm_unpackhi_epi8(T1A, zero); T1A = _mm_add_epi16(T1_0A, T1_1A); T1B = _mm_loadu_si128((__m128i *)(p_src + i_src + 16)); T1_0B = _mm_unpacklo_epi8(T1B, zero); T1_1B = _mm_unpackhi_epi8(T1B, zero); T1B = _mm_add_epi16(T1_0B, T1_1B); T1C = _mm_loadu_si128((__m128i *)(p_src + i_src + 32)); T1_0C = _mm_unpacklo_epi8(T1C, zero); T1_1C = _mm_unpackhi_epi8(T1C, zero); T1C = _mm_add_epi16(T1_0C, T1_1C); T1D = _mm_loadu_si128((__m128i *)(p_src + i_src + 48)); T1_0D = _mm_unpacklo_epi8(T1D, zero); T1_1D = _mm_unpackhi_epi8(T1D, zero); T1D = _mm_add_epi16(T1_0D, T1_1D); T1 = _mm_add_epi16(T1A, T1B); T1 = _mm_add_epi16(T1, T1C); T1 = _mm_add_epi16(T1, T1D); T2A = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src)); T2_0A = _mm_unpacklo_epi8(T2A, zero); T2_1A = _mm_unpackhi_epi8(T2A, zero); T2A = _mm_add_epi16(T2_0A, T2_1A); T2B = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 16)); T2_0B = _mm_unpacklo_epi8(T2B, zero); T2_1B = _mm_unpackhi_epi8(T2B, zero); T2B = _mm_add_epi16(T2_0B, T2_1B); T2C = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 32)); T2_0C = _mm_unpacklo_epi8(T2C, zero); T2_1C = _mm_unpackhi_epi8(T2C, zero); T2C = _mm_add_epi16(T2_0C, T2_1C); T2D = _mm_loadu_si128((__m128i *)(p_src + 2 * i_src + 48)); T2_0D = _mm_unpacklo_epi8(T2D, zero); T2_1D = _mm_unpackhi_epi8(T2D, zero); T2D = _mm_add_epi16(T2_0D, T2_1D); T2 = _mm_add_epi16(T2A, T2B); T2 = _mm_add_epi16(T2, T2C); T2 = _mm_add_epi16(T2, T2D); T3A = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src)); T3_0A = _mm_unpacklo_epi8(T3A, zero); T3_1A = _mm_unpackhi_epi8(T3A, zero); T3A = _mm_add_epi16(T3_0A, T3_1A); T3B = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 16)); T3_0B = _mm_unpacklo_epi8(T3B, zero); T3_1B = _mm_unpackhi_epi8(T3B, zero); T3B = _mm_add_epi16(T3_0B, T3_1B); T3C = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 32)); T3_0C = _mm_unpacklo_epi8(T3C, zero); T3_1C = _mm_unpackhi_epi8(T3C, zero); T3C = _mm_add_epi16(T3_0C, T3_1C); T3D = _mm_loadu_si128((__m128i *)(p_src + 3 * i_src + 48)); T3_0D = _mm_unpacklo_epi8(T3D, zero); 
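/* the same pattern repeats for every remaining row: load the 64 pixels as four 16-byte chunks, widen each to 16 bits by unpacking against zero, and accumulate the per-row sum */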
T3_1D = _mm_unpackhi_epi8(T3D, zero); T3D = _mm_add_epi16(T3_0D, T3_1D); T3 = _mm_add_epi16(T3A, T3B); T3 = _mm_add_epi16(T3, T3C); T3 = _mm_add_epi16(T3, T3D); T4A = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src)); T4_0A = _mm_unpacklo_epi8(T4A, zero); T4_1A = _mm_unpackhi_epi8(T4A, zero); T4A = _mm_add_epi16(T4_0A, T4_1A); T4B = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 16)); T4_0B = _mm_unpacklo_epi8(T4B, zero); T4_1B = _mm_unpackhi_epi8(T4B, zero); T4B = _mm_add_epi16(T4_0B, T4_1B); T4C = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 32)); T4_0C = _mm_unpacklo_epi8(T4C, zero); T4_1C = _mm_unpackhi_epi8(T4C, zero); T4C = _mm_add_epi16(T4_0C, T4_1C); T4D = _mm_loadu_si128((__m128i *)(p_src + 4 * i_src + 48)); T4_0D = _mm_unpacklo_epi8(T4D, zero); T4_1D = _mm_unpackhi_epi8(T4D, zero); T4D = _mm_add_epi16(T4_0D, T4_1D); T4 = _mm_add_epi16(T4A, T4B); T4 = _mm_add_epi16(T4, T4C); T4 = _mm_add_epi16(T4, T4D); T5A = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src)); T5_0A = _mm_unpacklo_epi8(T5A, zero); T5_1A = _mm_unpackhi_epi8(T5A, zero); T5A = _mm_add_epi16(T5_0A, T5_1A); T5B = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 16)); T5_0B = _mm_unpacklo_epi8(T5B, zero); T5_1B = _mm_unpackhi_epi8(T5B, zero); T5B = _mm_add_epi16(T5_0B, T5_1B); T5C = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 32)); T5_0C = _mm_unpacklo_epi8(T5C, zero); T5_1C = _mm_unpackhi_epi8(T5C, zero); T5C = _mm_add_epi16(T5_0C, T5_1C); T5D = _mm_loadu_si128((__m128i *)(p_src + 5 * i_src + 48)); T5_0D = _mm_unpacklo_epi8(T5D, zero); T5_1D = _mm_unpackhi_epi8(T5D, zero); T5D = _mm_add_epi16(T5_0D, T5_1D); T5 = _mm_add_epi16(T5A, T5B); T5 = _mm_add_epi16(T5, T5C); T5 = _mm_add_epi16(T5, T5D); T6A = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src)); T6_0A = _mm_unpacklo_epi8(T6A, zero); T6_1A = _mm_unpackhi_epi8(T6A, zero); T6A = _mm_add_epi16(T6_0A, T6_1A); T6B = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 16)); T6_0B = _mm_unpacklo_epi8(T6B, zero); T6_1B = _mm_unpackhi_epi8(T6B, zero); T6B = _mm_add_epi16(T6_0B, T6_1B); T6C = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 32)); T6_0C = _mm_unpacklo_epi8(T6C, zero); T6_1C = _mm_unpackhi_epi8(T6C, zero); T6C = _mm_add_epi16(T6_0C, T6_1C); T6D = _mm_loadu_si128((__m128i *)(p_src + 6 * i_src + 48)); T6_0D = _mm_unpacklo_epi8(T6D, zero); T6_1D = _mm_unpackhi_epi8(T6D, zero); T6D = _mm_add_epi16(T6_0D, T6_1D); T6 = _mm_add_epi16(T6A, T6B); T6 = _mm_add_epi16(T6, T6C); T6 = _mm_add_epi16(T6, T6D); T7A = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src)); T7_0A = _mm_unpacklo_epi8(T7A, zero); T7_1A = _mm_unpackhi_epi8(T7A, zero); T7A = _mm_add_epi16(T7_0A, T7_1A); T7B = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 16)); T7_0B = _mm_unpacklo_epi8(T7B, zero); T7_1B = _mm_unpackhi_epi8(T7B, zero); T7B = _mm_add_epi16(T7_0B, T7_1B); T7C = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 32)); T7_0C = _mm_unpacklo_epi8(T7C, zero); T7_1C = _mm_unpackhi_epi8(T7C, zero); T7C = _mm_add_epi16(T7_0C, T7_1C); T7D = _mm_loadu_si128((__m128i *)(p_src + 7 * i_src + 48)); T7_0D = _mm_unpacklo_epi8(T7D, zero); T7_1D = _mm_unpackhi_epi8(T7D, zero); T7D = _mm_add_epi16(T7_0D, T7_1D); T7 = _mm_add_epi16(T7A, T7B); T7 = _mm_add_epi16(T7, T7C); T7 = _mm_add_epi16(T7, T7D); T8A = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src)); T8_0A = _mm_unpacklo_epi8(T8A, zero); T8_1A = _mm_unpackhi_epi8(T8A, zero); T8A = _mm_add_epi16(T8_0A, T8_1A); T8B = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 16)); T8_0B = _mm_unpacklo_epi8(T8B, zero); T8_1B = _mm_unpackhi_epi8(T8B, zero); T8B = 
_mm_add_epi16(T8_0B, T8_1B); T8C = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 32)); T8_0C = _mm_unpacklo_epi8(T8C, zero); T8_1C = _mm_unpackhi_epi8(T8C, zero); T8C = _mm_add_epi16(T8_0C, T8_1C); T8D = _mm_loadu_si128((__m128i *)(p_src + 8 * i_src + 48)); T8_0D = _mm_unpacklo_epi8(T8D, zero); T8_1D = _mm_unpackhi_epi8(T8D, zero); T8D = _mm_add_epi16(T8_0D, T8_1D); T8 = _mm_add_epi16(T8A, T8B); T8 = _mm_add_epi16(T8, T8C); T8 = _mm_add_epi16(T8, T8D); T9A = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src)); T9_0A = _mm_unpacklo_epi8(T9A, zero); T9_1A = _mm_unpackhi_epi8(T9A, zero); T9A = _mm_add_epi16(T9_0A, T9_1A); T9B = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 16)); T9_0B = _mm_unpacklo_epi8(T9B, zero); T9_1B = _mm_unpackhi_epi8(T9B, zero); T9B = _mm_add_epi16(T9_0B, T9_1B); T9C = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 32)); T9_0C = _mm_unpacklo_epi8(T9C, zero); T9_1C = _mm_unpackhi_epi8(T9C, zero); T9C = _mm_add_epi16(T9_0C, T9_1C); T9D = _mm_loadu_si128((__m128i *)(p_src + 9 * i_src + 48)); T9_0D = _mm_unpacklo_epi8(T9D, zero); T9_1D = _mm_unpackhi_epi8(T9D, zero); T9D = _mm_add_epi16(T9_0D, T9_1D); T9 = _mm_add_epi16(T9A, T9B); T9 = _mm_add_epi16(T9, T9C); T9 = _mm_add_epi16(T9, T9D); T10A = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src)); T10_0A = _mm_unpacklo_epi8(T10A, zero); T10_1A = _mm_unpackhi_epi8(T10A, zero); T10A = _mm_add_epi16(T10_0A, T10_1A); T10B = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 16)); T10_0B = _mm_unpacklo_epi8(T10B, zero); T10_1B = _mm_unpackhi_epi8(T10B, zero); T10B = _mm_add_epi16(T10_0B, T10_1B); T10C = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 32)); T10_0C = _mm_unpacklo_epi8(T10C, zero); T10_1C = _mm_unpackhi_epi8(T10C, zero); T10C = _mm_add_epi16(T10_0C, T10_1C); T10D = _mm_loadu_si128((__m128i *)(p_src + 10 * i_src + 48)); T10_0D = _mm_unpacklo_epi8(T10D, zero); T10_1D = _mm_unpackhi_epi8(T10D, zero); T10D = _mm_add_epi16(T10_0D, T10_1D); T10 = _mm_add_epi16(T10A, T10B); T10 = _mm_add_epi16(T10, T10C); T10 = _mm_add_epi16(T10, T10D); T11A = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src)); T11_0A = _mm_unpacklo_epi8(T11A, zero); T11_1A = _mm_unpackhi_epi8(T11A, zero); T11A = _mm_add_epi16(T11_0A, T11_1A); T11B = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 16)); T11_0B = _mm_unpacklo_epi8(T11B, zero); T11_1B = _mm_unpackhi_epi8(T11B, zero); T11B = _mm_add_epi16(T11_0B, T11_1B); T11C = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 32)); T11_0C = _mm_unpacklo_epi8(T11C, zero); T11_1C = _mm_unpackhi_epi8(T11C, zero); T11C = _mm_add_epi16(T11_0C, T11_1C); T11D = _mm_loadu_si128((__m128i *)(p_src + 11 * i_src + 48)); T11_0D = _mm_unpacklo_epi8(T11D, zero); T11_1D = _mm_unpackhi_epi8(T11D, zero); T11D = _mm_add_epi16(T11_0D, T11_1D); T11 = _mm_add_epi16(T11A, T11B); T11 = _mm_add_epi16(T11, T11C); T11 = _mm_add_epi16(T11, T11D); T12A = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src)); T12_0A = _mm_unpacklo_epi8(T12A, zero); T12_1A = _mm_unpackhi_epi8(T12A, zero); T12A = _mm_add_epi16(T12_0A, T12_1A); T12B = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 16)); T12_0B = _mm_unpacklo_epi8(T12B, zero); T12_1B = _mm_unpackhi_epi8(T12B, zero); T12B = _mm_add_epi16(T12_0B, T12_1B); T12C = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 32)); T12_0C = _mm_unpacklo_epi8(T12C, zero); T12_1C = _mm_unpackhi_epi8(T12C, zero); T12C = _mm_add_epi16(T12_0C, T12_1C); T12D = _mm_loadu_si128((__m128i *)(p_src + 12 * i_src + 48)); T12_0D = _mm_unpacklo_epi8(T12D, zero); T12_1D = _mm_unpackhi_epi8(T12D, zero); T12D = 
_mm_add_epi16(T12_0D, T12_1D); T12 = _mm_add_epi16(T12A, T12B); T12 = _mm_add_epi16(T12, T12C); T12 = _mm_add_epi16(T12, T12D); T13A = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src)); T13_0A = _mm_unpacklo_epi8(T13A, zero); T13_1A = _mm_unpackhi_epi8(T13A, zero); T13A = _mm_add_epi16(T13_0A, T13_1A); T13B = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 16)); T13_0B = _mm_unpacklo_epi8(T13B, zero); T13_1B = _mm_unpackhi_epi8(T13B, zero); T13B = _mm_add_epi16(T13_0B, T13_1B); T13C = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 32)); T13_0C = _mm_unpacklo_epi8(T13C, zero); T13_1C = _mm_unpackhi_epi8(T13C, zero); T13C = _mm_add_epi16(T13_0C, T13_1C); T13D = _mm_loadu_si128((__m128i *)(p_src + 13 * i_src + 48)); T13_0D = _mm_unpacklo_epi8(T13D, zero); T13_1D = _mm_unpackhi_epi8(T13D, zero); T13D = _mm_add_epi16(T13_0D, T13_1D); T13 = _mm_add_epi16(T13A, T13B); T13 = _mm_add_epi16(T13, T13C); T13 = _mm_add_epi16(T13, T13D); T14A = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src)); T14_0A = _mm_unpacklo_epi8(T14A, zero); T14_1A = _mm_unpackhi_epi8(T14A, zero); T14A = _mm_add_epi16(T14_0A, T14_1A); T14B = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 16)); T14_0B = _mm_unpacklo_epi8(T14B, zero); T14_1B = _mm_unpackhi_epi8(T14B, zero); T14B = _mm_add_epi16(T14_0B, T14_1B); T14C = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 32)); T14_0C = _mm_unpacklo_epi8(T14C, zero); T14_1C = _mm_unpackhi_epi8(T14C, zero); T14C = _mm_add_epi16(T14_0C, T14_1C); T14D = _mm_loadu_si128((__m128i *)(p_src + 14 * i_src + 48)); T14_0D = _mm_unpacklo_epi8(T14D, zero); T14_1D = _mm_unpackhi_epi8(T14D, zero); T14D = _mm_add_epi16(T14_0D, T14_1D); T14 = _mm_add_epi16(T14A, T14B); T14 = _mm_add_epi16(T14, T14C); T14 = _mm_add_epi16(T14, T14D); T15A = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src)); T15_0A = _mm_unpacklo_epi8(T15A, zero); T15_1A = _mm_unpackhi_epi8(T15A, zero); T15A = _mm_add_epi16(T15_0A, T15_1A); T15B = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 16)); T15_0B = _mm_unpacklo_epi8(T15B, zero); T15_1B = _mm_unpackhi_epi8(T15B, zero); T15B = _mm_add_epi16(T15_0B, T15_1B); T15C = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 32)); T15_0C = _mm_unpacklo_epi8(T15C, zero); T15_1C = _mm_unpackhi_epi8(T15C, zero); T15C = _mm_add_epi16(T15_0C, T15_1C); T15D = _mm_loadu_si128((__m128i *)(p_src + 15 * i_src + 48)); T15_0D = _mm_unpacklo_epi8(T15D, zero); T15_1D = _mm_unpackhi_epi8(T15D, zero); T15D = _mm_add_epi16(T15_0D, T15_1D); T15 = _mm_add_epi16(T15A, T15B); T15 = _mm_add_epi16(T15, T15C); T15 = _mm_add_epi16(T15, T15D); T16A = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src)); T16_0A = _mm_unpacklo_epi8(T16A, zero); T16_1A = _mm_unpackhi_epi8(T16A, zero); T16A = _mm_add_epi16(T16_0A, T16_1A); T16B = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 16)); T16_0B = _mm_unpacklo_epi8(T16B, zero); T16_1B = _mm_unpackhi_epi8(T16B, zero); T16B = _mm_add_epi16(T16_0B, T16_1B); T16C = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 32)); T16_0C = _mm_unpacklo_epi8(T16C, zero); T16_1C = _mm_unpackhi_epi8(T16C, zero); T16C = _mm_add_epi16(T16_0C, T16_1C); T16D = _mm_loadu_si128((__m128i *)(p_src + 16 * i_src + 48)); T16_0D = _mm_unpacklo_epi8(T16D, zero); T16_1D = _mm_unpackhi_epi8(T16D, zero); T16D = _mm_add_epi16(T16_0D, T16_1D); T16 = _mm_add_epi16(T16A, T16B); T16 = _mm_add_epi16(T16, T16C); T16 = _mm_add_epi16(T16, T16D); T17A = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src)); T17_0A = _mm_unpacklo_epi8(T17A, zero); T17_1A = _mm_unpackhi_epi8(T17A, zero); T17A = _mm_add_epi16(T17_0A, T17_1A); 
T17B = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 16)); T17_0B = _mm_unpacklo_epi8(T17B, zero); T17_1B = _mm_unpackhi_epi8(T17B, zero); T17B = _mm_add_epi16(T17_0B, T17_1B); T17C = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 32)); T17_0C = _mm_unpacklo_epi8(T17C, zero); T17_1C = _mm_unpackhi_epi8(T17C, zero); T17C = _mm_add_epi16(T17_0C, T17_1C); T17D = _mm_loadu_si128((__m128i *)(p_src + 17 * i_src + 48)); T17_0D = _mm_unpacklo_epi8(T17D, zero); T17_1D = _mm_unpackhi_epi8(T17D, zero); T17D = _mm_add_epi16(T17_0D, T17_1D); T17 = _mm_add_epi16(T17A, T17B); T17 = _mm_add_epi16(T17, T17C); T17 = _mm_add_epi16(T17, T17D); T18A = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src)); T18_0A = _mm_unpacklo_epi8(T18A, zero); T18_1A = _mm_unpackhi_epi8(T18A, zero); T18A = _mm_add_epi16(T18_0A, T18_1A); T18B = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 16)); T18_0B = _mm_unpacklo_epi8(T18B, zero); T18_1B = _mm_unpackhi_epi8(T18B, zero); T18B = _mm_add_epi16(T18_0B, T18_1B); T18C = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 32)); T18_0C = _mm_unpacklo_epi8(T18C, zero); T18_1C = _mm_unpackhi_epi8(T18C, zero); T18C = _mm_add_epi16(T18_0C, T18_1C); T18D = _mm_loadu_si128((__m128i *)(p_src + 18 * i_src + 48)); T18_0D = _mm_unpacklo_epi8(T18D, zero); T18_1D = _mm_unpackhi_epi8(T18D, zero); T18D = _mm_add_epi16(T18_0D, T18_1D); T18 = _mm_add_epi16(T18A, T18B); T18 = _mm_add_epi16(T18, T18C); T18 = _mm_add_epi16(T18, T18D); T19A = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src)); T19_0A = _mm_unpacklo_epi8(T19A, zero); T19_1A = _mm_unpackhi_epi8(T19A, zero); T19A = _mm_add_epi16(T19_0A, T19_1A); T19B = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 16)); T19_0B = _mm_unpacklo_epi8(T19B, zero); T19_1B = _mm_unpackhi_epi8(T19B, zero); T19B = _mm_add_epi16(T19_0B, T19_1B); T19C = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 32)); T19_0C = _mm_unpacklo_epi8(T19C, zero); T19_1C = _mm_unpackhi_epi8(T19C, zero); T19C = _mm_add_epi16(T19_0C, T19_1C); T19D = _mm_loadu_si128((__m128i *)(p_src + 19 * i_src + 48)); T19_0D = _mm_unpacklo_epi8(T19D, zero); T19_1D = _mm_unpackhi_epi8(T19D, zero); T19D = _mm_add_epi16(T19_0D, T19_1D); T19 = _mm_add_epi16(T19A, T19B); T19 = _mm_add_epi16(T19, T19C); T19 = _mm_add_epi16(T19, T19D); T20A = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src)); T20_0A = _mm_unpacklo_epi8(T20A, zero); T20_1A = _mm_unpackhi_epi8(T20A, zero); T20A = _mm_add_epi16(T20_0A, T20_1A); T20B = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 16)); T20_0B = _mm_unpacklo_epi8(T20B, zero); T20_1B = _mm_unpackhi_epi8(T20B, zero); T20B = _mm_add_epi16(T20_0B, T20_1B); T20C = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 32)); T20_0C = _mm_unpacklo_epi8(T20C, zero); T20_1C = _mm_unpackhi_epi8(T20C, zero); T20C = _mm_add_epi16(T20_0C, T20_1C); T20D = _mm_loadu_si128((__m128i *)(p_src + 20 * i_src + 48)); T20_0D = _mm_unpacklo_epi8(T20D, zero); T20_1D = _mm_unpackhi_epi8(T20D, zero); T20D = _mm_add_epi16(T20_0D, T20_1D); T20 = _mm_add_epi16(T20A, T20B); T20 = _mm_add_epi16(T20, T20C); T20 = _mm_add_epi16(T20, T20D); T21A = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src)); T21_0A = _mm_unpacklo_epi8(T21A, zero); T21_1A = _mm_unpackhi_epi8(T21A, zero); T21A = _mm_add_epi16(T21_0A, T21_1A); T21B = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 16)); T21_0B = _mm_unpacklo_epi8(T21B, zero); T21_1B = _mm_unpackhi_epi8(T21B, zero); T21B = _mm_add_epi16(T21_0B, T21_1B); T21C = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 32)); T21_0C = _mm_unpacklo_epi8(T21C, zero); T21_1C = 
_mm_unpackhi_epi8(T21C, zero); T21C = _mm_add_epi16(T21_0C, T21_1C); T21D = _mm_loadu_si128((__m128i *)(p_src + 21 * i_src + 48)); T21_0D = _mm_unpacklo_epi8(T21D, zero); T21_1D = _mm_unpackhi_epi8(T21D, zero); T21D = _mm_add_epi16(T21_0D, T21_1D); T21 = _mm_add_epi16(T21A, T21B); T21 = _mm_add_epi16(T21, T21C); T21 = _mm_add_epi16(T21, T21D); T22A = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src)); T22_0A = _mm_unpacklo_epi8(T22A, zero); T22_1A = _mm_unpackhi_epi8(T22A, zero); T22A = _mm_add_epi16(T22_0A, T22_1A); T22B = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 16)); T22_0B = _mm_unpacklo_epi8(T22B, zero); T22_1B = _mm_unpackhi_epi8(T22B, zero); T22B = _mm_add_epi16(T22_0B, T22_1B); T22C = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 32)); T22_0C = _mm_unpacklo_epi8(T22C, zero); T22_1C = _mm_unpackhi_epi8(T22C, zero); T22C = _mm_add_epi16(T22_0C, T22_1C); T22D = _mm_loadu_si128((__m128i *)(p_src + 22 * i_src + 48)); T22_0D = _mm_unpacklo_epi8(T22D, zero); T22_1D = _mm_unpackhi_epi8(T22D, zero); T22D = _mm_add_epi16(T22_0D, T22_1D); T22 = _mm_add_epi16(T22A, T22B); T22 = _mm_add_epi16(T22, T22C); T22 = _mm_add_epi16(T22, T22D); T23A = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src)); T23_0A = _mm_unpacklo_epi8(T23A, zero); T23_1A = _mm_unpackhi_epi8(T23A, zero); T23A = _mm_add_epi16(T23_0A, T23_1A); T23B = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 16)); T23_0B = _mm_unpacklo_epi8(T23B, zero); T23_1B = _mm_unpackhi_epi8(T23B, zero); T23B = _mm_add_epi16(T23_0B, T23_1B); T23C = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 32)); T23_0C = _mm_unpacklo_epi8(T23C, zero); T23_1C = _mm_unpackhi_epi8(T23C, zero); T23C = _mm_add_epi16(T23_0C, T23_1C); T23D = _mm_loadu_si128((__m128i *)(p_src + 23 * i_src + 48)); T23_0D = _mm_unpacklo_epi8(T23D, zero); T23_1D = _mm_unpackhi_epi8(T23D, zero); T23D = _mm_add_epi16(T23_0D, T23_1D); T23 = _mm_add_epi16(T23A, T23B); T23 = _mm_add_epi16(T23, T23C); T23 = _mm_add_epi16(T23, T23D); T24A = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src)); T24_0A = _mm_unpacklo_epi8(T24A, zero); T24_1A = _mm_unpackhi_epi8(T24A, zero); T24A = _mm_add_epi16(T24_0A, T24_1A); T24B = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 16)); T24_0B = _mm_unpacklo_epi8(T24B, zero); T24_1B = _mm_unpackhi_epi8(T24B, zero); T24B = _mm_add_epi16(T24_0B, T24_1B); T24C = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 32)); T24_0C = _mm_unpacklo_epi8(T24C, zero); T24_1C = _mm_unpackhi_epi8(T24C, zero); T24C = _mm_add_epi16(T24_0C, T24_1C); T24D = _mm_loadu_si128((__m128i *)(p_src + 24 * i_src + 48)); T24_0D = _mm_unpacklo_epi8(T24D, zero); T24_1D = _mm_unpackhi_epi8(T24D, zero); T24D = _mm_add_epi16(T24_0D, T24_1D); T24 = _mm_add_epi16(T24A, T24B); T24 = _mm_add_epi16(T24, T24C); T24 = _mm_add_epi16(T24, T24D); T25A = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src)); T25_0A = _mm_unpacklo_epi8(T25A, zero); T25_1A = _mm_unpackhi_epi8(T25A, zero); T25A = _mm_add_epi16(T25_0A, T25_1A); T25B = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 16)); T25_0B = _mm_unpacklo_epi8(T25B, zero); T25_1B = _mm_unpackhi_epi8(T25B, zero); T25B = _mm_add_epi16(T25_0B, T25_1B); T25C = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 32)); T25_0C = _mm_unpacklo_epi8(T25C, zero); T25_1C = _mm_unpackhi_epi8(T25C, zero); T25C = _mm_add_epi16(T25_0C, T25_1C); T25D = _mm_loadu_si128((__m128i *)(p_src + 25 * i_src + 48)); T25_0D = _mm_unpacklo_epi8(T25D, zero); T25_1D = _mm_unpackhi_epi8(T25D, zero); T25D = _mm_add_epi16(T25_0D, T25_1D); T25 = _mm_add_epi16(T25A, T25B); T25 = _mm_add_epi16(T25, 
T25C); T25 = _mm_add_epi16(T25, T25D); T26A = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src)); T26_0A = _mm_unpacklo_epi8(T26A, zero); T26_1A = _mm_unpackhi_epi8(T26A, zero); T26A = _mm_add_epi16(T26_0A, T26_1A); T26B = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 16)); T26_0B = _mm_unpacklo_epi8(T26B, zero); T26_1B = _mm_unpackhi_epi8(T26B, zero); T26B = _mm_add_epi16(T26_0B, T26_1B); T26C = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 32)); T26_0C = _mm_unpacklo_epi8(T26C, zero); T26_1C = _mm_unpackhi_epi8(T26C, zero); T26C = _mm_add_epi16(T26_0C, T26_1C); T26D = _mm_loadu_si128((__m128i *)(p_src + 26 * i_src + 48)); T26_0D = _mm_unpacklo_epi8(T26D, zero); T26_1D = _mm_unpackhi_epi8(T26D, zero); T26D = _mm_add_epi16(T26_0D, T26_1D); T26 = _mm_add_epi16(T26A, T26B); T26 = _mm_add_epi16(T26, T26C); T26 = _mm_add_epi16(T26, T26D); T27A = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src)); T27_0A = _mm_unpacklo_epi8(T27A, zero); T27_1A = _mm_unpackhi_epi8(T27A, zero); T27A = _mm_add_epi16(T27_0A, T27_1A); T27B = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 16)); T27_0B = _mm_unpacklo_epi8(T27B, zero); T27_1B = _mm_unpackhi_epi8(T27B, zero); T27B = _mm_add_epi16(T27_0B, T27_1B); T27C = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 32)); T27_0C = _mm_unpacklo_epi8(T27C, zero); T27_1C = _mm_unpackhi_epi8(T27C, zero); T27C = _mm_add_epi16(T27_0C, T27_1C); T27D = _mm_loadu_si128((__m128i *)(p_src + 27 * i_src + 48)); T27_0D = _mm_unpacklo_epi8(T27D, zero); T27_1D = _mm_unpackhi_epi8(T27D, zero); T27D = _mm_add_epi16(T27_0D, T27_1D); T27 = _mm_add_epi16(T27A, T27B); T27 = _mm_add_epi16(T27, T27C); T27 = _mm_add_epi16(T27, T27D); T28A = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src)); T28_0A = _mm_unpacklo_epi8(T28A, zero); T28_1A = _mm_unpackhi_epi8(T28A, zero); T28A = _mm_add_epi16(T28_0A, T28_1A); T28B = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 16)); T28_0B = _mm_unpacklo_epi8(T28B, zero); T28_1B = _mm_unpackhi_epi8(T28B, zero); T28B = _mm_add_epi16(T28_0B, T28_1B); T28C = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 32)); T28_0C = _mm_unpacklo_epi8(T28C, zero); T28_1C = _mm_unpackhi_epi8(T28C, zero); T28C = _mm_add_epi16(T28_0C, T28_1C); T28D = _mm_loadu_si128((__m128i *)(p_src + 28 * i_src + 48)); T28_0D = _mm_unpacklo_epi8(T28D, zero); T28_1D = _mm_unpackhi_epi8(T28D, zero); T28D = _mm_add_epi16(T28_0D, T28_1D); T28 = _mm_add_epi16(T28A, T28B); T28 = _mm_add_epi16(T28, T28C); T28 = _mm_add_epi16(T28, T28D); T29A = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src)); T29_0A = _mm_unpacklo_epi8(T29A, zero); T29_1A = _mm_unpackhi_epi8(T29A, zero); T29A = _mm_add_epi16(T29_0A, T29_1A); T29B = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 16)); T29_0B = _mm_unpacklo_epi8(T29B, zero); T29_1B = _mm_unpackhi_epi8(T29B, zero); T29B = _mm_add_epi16(T29_0B, T29_1B); T29C = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 32)); T29_0C = _mm_unpacklo_epi8(T29C, zero); T29_1C = _mm_unpackhi_epi8(T29C, zero); T29C = _mm_add_epi16(T29_0C, T29_1C); T29D = _mm_loadu_si128((__m128i *)(p_src + 29 * i_src + 48)); T29_0D = _mm_unpacklo_epi8(T29D, zero); T29_1D = _mm_unpackhi_epi8(T29D, zero); T29D = _mm_add_epi16(T29_0D, T29_1D); T29 = _mm_add_epi16(T29A, T29B); T29 = _mm_add_epi16(T29, T29C); T29 = _mm_add_epi16(T29, T29D); T30A = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src)); T30_0A = _mm_unpacklo_epi8(T30A, zero); T30_1A = _mm_unpackhi_epi8(T30A, zero); T30A = _mm_add_epi16(T30_0A, T30_1A); T30B = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 16)); T30_0B = 
_mm_unpacklo_epi8(T30B, zero); T30_1B = _mm_unpackhi_epi8(T30B, zero); T30B = _mm_add_epi16(T30_0B, T30_1B); T30C = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 32)); T30_0C = _mm_unpacklo_epi8(T30C, zero); T30_1C = _mm_unpackhi_epi8(T30C, zero); T30C = _mm_add_epi16(T30_0C, T30_1C); T30D = _mm_loadu_si128((__m128i *)(p_src + 30 * i_src + 48)); T30_0D = _mm_unpacklo_epi8(T30D, zero); T30_1D = _mm_unpackhi_epi8(T30D, zero); T30D = _mm_add_epi16(T30_0D, T30_1D); T30 = _mm_add_epi16(T30A, T30B); T30 = _mm_add_epi16(T30, T30C); T30 = _mm_add_epi16(T30, T30D); T31A = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src)); T31_0A = _mm_unpacklo_epi8(T31A, zero); T31_1A = _mm_unpackhi_epi8(T31A, zero); T31A = _mm_add_epi16(T31_0A, T31_1A); T31B = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 16)); T31_0B = _mm_unpacklo_epi8(T31B, zero); T31_1B = _mm_unpackhi_epi8(T31B, zero); T31B = _mm_add_epi16(T31_0B, T31_1B); T31C = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 32)); T31_0C = _mm_unpacklo_epi8(T31C, zero); T31_1C = _mm_unpackhi_epi8(T31C, zero); T31C = _mm_add_epi16(T31_0C, T31_1C); T31D = _mm_loadu_si128((__m128i *)(p_src + 31 * i_src + 48)); T31_0D = _mm_unpacklo_epi8(T31D, zero); T31_1D = _mm_unpackhi_epi8(T31D, zero); T31D = _mm_add_epi16(T31_0D, T31_1D); T31 = _mm_add_epi16(T31A, T31B); T31 = _mm_add_epi16(T31, T31C); T31 = _mm_add_epi16(T31, T31D); T32A = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src)); T32_0A = _mm_unpacklo_epi8(T32A, zero); T32_1A = _mm_unpackhi_epi8(T32A, zero); T32A = _mm_add_epi16(T32_0A, T32_1A); T32B = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 16)); T32_0B = _mm_unpacklo_epi8(T32B, zero); T32_1B = _mm_unpackhi_epi8(T32B, zero); T32B = _mm_add_epi16(T32_0B, T32_1B); T32C = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 32)); T32_0C = _mm_unpacklo_epi8(T32C, zero); T32_1C = _mm_unpackhi_epi8(T32C, zero); T32C = _mm_add_epi16(T32_0C, T32_1C); T32D = _mm_loadu_si128((__m128i *)(p_src + 32 * i_src + 48)); T32_0D = _mm_unpacklo_epi8(T32D, zero); T32_1D = _mm_unpackhi_epi8(T32D, zero); T32D = _mm_add_epi16(T32_0D, T32_1D); T32 = _mm_add_epi16(T32A, T32B); T32 = _mm_add_epi16(T32, T32C); T32 = _mm_add_epi16(T32, T32D); T33A = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src)); T33_0A = _mm_unpacklo_epi8(T33A, zero); T33_1A = _mm_unpackhi_epi8(T33A, zero); T33A = _mm_add_epi16(T33_0A, T33_1A); T33B = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 16)); T33_0B = _mm_unpacklo_epi8(T33B, zero); T33_1B = _mm_unpackhi_epi8(T33B, zero); T33B = _mm_add_epi16(T33_0B, T33_1B); T33C = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 32)); T33_0C = _mm_unpacklo_epi8(T33C, zero); T33_1C = _mm_unpackhi_epi8(T33C, zero); T33C = _mm_add_epi16(T33_0C, T33_1C); T33D = _mm_loadu_si128((__m128i *)(p_src + 33 * i_src + 48)); T33_0D = _mm_unpacklo_epi8(T33D, zero); T33_1D = _mm_unpackhi_epi8(T33D, zero); T33D = _mm_add_epi16(T33_0D, T33_1D); T33 = _mm_add_epi16(T33A, T33B); T33 = _mm_add_epi16(T33, T33C); T33 = _mm_add_epi16(T33, T33D); T34A = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src)); T34_0A = _mm_unpacklo_epi8(T34A, zero); T34_1A = _mm_unpackhi_epi8(T34A, zero); T34A = _mm_add_epi16(T34_0A, T34_1A); T34B = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 16)); T34_0B = _mm_unpacklo_epi8(T34B, zero); T34_1B = _mm_unpackhi_epi8(T34B, zero); T34B = _mm_add_epi16(T34_0B, T34_1B); T34C = _mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 32)); T34_0C = _mm_unpacklo_epi8(T34C, zero); T34_1C = _mm_unpackhi_epi8(T34C, zero); T34C = _mm_add_epi16(T34_0C, T34_1C); T34D = 
_mm_loadu_si128((__m128i *)(p_src + 34 * i_src + 48)); T34_0D = _mm_unpacklo_epi8(T34D, zero); T34_1D = _mm_unpackhi_epi8(T34D, zero); T34D = _mm_add_epi16(T34_0D, T34_1D); T34 = _mm_add_epi16(T34A, T34B); T34 = _mm_add_epi16(T34, T34C); T34 = _mm_add_epi16(T34, T34D); T35A = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src)); T35_0A = _mm_unpacklo_epi8(T35A, zero); T35_1A = _mm_unpackhi_epi8(T35A, zero); T35A = _mm_add_epi16(T35_0A, T35_1A); T35B = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 16)); T35_0B = _mm_unpacklo_epi8(T35B, zero); T35_1B = _mm_unpackhi_epi8(T35B, zero); T35B = _mm_add_epi16(T35_0B, T35_1B); T35C = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 32)); T35_0C = _mm_unpacklo_epi8(T35C, zero); T35_1C = _mm_unpackhi_epi8(T35C, zero); T35C = _mm_add_epi16(T35_0C, T35_1C); T35D = _mm_loadu_si128((__m128i *)(p_src + 35 * i_src + 48)); T35_0D = _mm_unpacklo_epi8(T35D, zero); T35_1D = _mm_unpackhi_epi8(T35D, zero); T35D = _mm_add_epi16(T35_0D, T35_1D); T35 = _mm_add_epi16(T35A, T35B); T35 = _mm_add_epi16(T35, T35C); T35 = _mm_add_epi16(T35, T35D); T36A = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src)); T36_0A = _mm_unpacklo_epi8(T36A, zero); T36_1A = _mm_unpackhi_epi8(T36A, zero); T36A = _mm_add_epi16(T36_0A, T36_1A); T36B = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 16)); T36_0B = _mm_unpacklo_epi8(T36B, zero); T36_1B = _mm_unpackhi_epi8(T36B, zero); T36B = _mm_add_epi16(T36_0B, T36_1B); T36C = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 32)); T36_0C = _mm_unpacklo_epi8(T36C, zero); T36_1C = _mm_unpackhi_epi8(T36C, zero); T36C = _mm_add_epi16(T36_0C, T36_1C); T36D = _mm_loadu_si128((__m128i *)(p_src + 36 * i_src + 48)); T36_0D = _mm_unpacklo_epi8(T36D, zero); T36_1D = _mm_unpackhi_epi8(T36D, zero); T36D = _mm_add_epi16(T36_0D, T36_1D); T36 = _mm_add_epi16(T36A, T36B); T36 = _mm_add_epi16(T36, T36C); T36 = _mm_add_epi16(T36, T36D); T37A = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src)); T37_0A = _mm_unpacklo_epi8(T37A, zero); T37_1A = _mm_unpackhi_epi8(T37A, zero); T37A = _mm_add_epi16(T37_0A, T37_1A); T37B = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 16)); T37_0B = _mm_unpacklo_epi8(T37B, zero); T37_1B = _mm_unpackhi_epi8(T37B, zero); T37B = _mm_add_epi16(T37_0B, T37_1B); T37C = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 32)); T37_0C = _mm_unpacklo_epi8(T37C, zero); T37_1C = _mm_unpackhi_epi8(T37C, zero); T37C = _mm_add_epi16(T37_0C, T37_1C); T37D = _mm_loadu_si128((__m128i *)(p_src + 37 * i_src + 48)); T37_0D = _mm_unpacklo_epi8(T37D, zero); T37_1D = _mm_unpackhi_epi8(T37D, zero); T37D = _mm_add_epi16(T37_0D, T37_1D); T37 = _mm_add_epi16(T37A, T37B); T37 = _mm_add_epi16(T37, T37C); T37 = _mm_add_epi16(T37, T37D); T38A = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src)); T38_0A = _mm_unpacklo_epi8(T38A, zero); T38_1A = _mm_unpackhi_epi8(T38A, zero); T38A = _mm_add_epi16(T38_0A, T38_1A); T38B = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 16)); T38_0B = _mm_unpacklo_epi8(T38B, zero); T38_1B = _mm_unpackhi_epi8(T38B, zero); T38B = _mm_add_epi16(T38_0B, T38_1B); T38C = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 32)); T38_0C = _mm_unpacklo_epi8(T38C, zero); T38_1C = _mm_unpackhi_epi8(T38C, zero); T38C = _mm_add_epi16(T38_0C, T38_1C); T38D = _mm_loadu_si128((__m128i *)(p_src + 38 * i_src + 48)); T38_0D = _mm_unpacklo_epi8(T38D, zero); T38_1D = _mm_unpackhi_epi8(T38D, zero); T38D = _mm_add_epi16(T38_0D, T38_1D); T38 = _mm_add_epi16(T38A, T38B); T38 = _mm_add_epi16(T38, T38C); T38 = _mm_add_epi16(T38, T38D); T39A = _mm_loadu_si128((__m128i 
*)(p_src + 39 * i_src)); T39_0A = _mm_unpacklo_epi8(T39A, zero); T39_1A = _mm_unpackhi_epi8(T39A, zero); T39A = _mm_add_epi16(T39_0A, T39_1A); T39B = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 16)); T39_0B = _mm_unpacklo_epi8(T39B, zero); T39_1B = _mm_unpackhi_epi8(T39B, zero); T39B = _mm_add_epi16(T39_0B, T39_1B); T39C = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 32)); T39_0C = _mm_unpacklo_epi8(T39C, zero); T39_1C = _mm_unpackhi_epi8(T39C, zero); T39C = _mm_add_epi16(T39_0C, T39_1C); T39D = _mm_loadu_si128((__m128i *)(p_src + 39 * i_src + 48)); T39_0D = _mm_unpacklo_epi8(T39D, zero); T39_1D = _mm_unpackhi_epi8(T39D, zero); T39D = _mm_add_epi16(T39_0D, T39_1D); T39 = _mm_add_epi16(T39A, T39B); T39 = _mm_add_epi16(T39, T39C); T39 = _mm_add_epi16(T39, T39D); T40A = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src)); T40_0A = _mm_unpacklo_epi8(T40A, zero); T40_1A = _mm_unpackhi_epi8(T40A, zero); T40A = _mm_add_epi16(T40_0A, T40_1A); T40B = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 16)); T40_0B = _mm_unpacklo_epi8(T40B, zero); T40_1B = _mm_unpackhi_epi8(T40B, zero); T40B = _mm_add_epi16(T40_0B, T40_1B); T40C = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 32)); T40_0C = _mm_unpacklo_epi8(T40C, zero); T40_1C = _mm_unpackhi_epi8(T40C, zero); T40C = _mm_add_epi16(T40_0C, T40_1C); T40D = _mm_loadu_si128((__m128i *)(p_src + 40 * i_src + 48)); T40_0D = _mm_unpacklo_epi8(T40D, zero); T40_1D = _mm_unpackhi_epi8(T40D, zero); T40D = _mm_add_epi16(T40_0D, T40_1D); T40 = _mm_add_epi16(T40A, T40B); T40 = _mm_add_epi16(T40, T40C); T40 = _mm_add_epi16(T40, T40D); T41A = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src)); T41_0A = _mm_unpacklo_epi8(T41A, zero); T41_1A = _mm_unpackhi_epi8(T41A, zero); T41A = _mm_add_epi16(T41_0A, T41_1A); T41B = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 16)); T41_0B = _mm_unpacklo_epi8(T41B, zero); T41_1B = _mm_unpackhi_epi8(T41B, zero); T41B = _mm_add_epi16(T41_0B, T41_1B); T41C = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 32)); T41_0C = _mm_unpacklo_epi8(T41C, zero); T41_1C = _mm_unpackhi_epi8(T41C, zero); T41C = _mm_add_epi16(T41_0C, T41_1C); T41D = _mm_loadu_si128((__m128i *)(p_src + 41 * i_src + 48)); T41_0D = _mm_unpacklo_epi8(T41D, zero); T41_1D = _mm_unpackhi_epi8(T41D, zero); T41D = _mm_add_epi16(T41_0D, T41_1D); T41 = _mm_add_epi16(T41A, T41B); T41 = _mm_add_epi16(T41, T41C); T41 = _mm_add_epi16(T41, T41D); T42A = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src)); T42_0A = _mm_unpacklo_epi8(T42A, zero); T42_1A = _mm_unpackhi_epi8(T42A, zero); T42A = _mm_add_epi16(T42_0A, T42_1A); T42B = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 16)); T42_0B = _mm_unpacklo_epi8(T42B, zero); T42_1B = _mm_unpackhi_epi8(T42B, zero); T42B = _mm_add_epi16(T42_0B, T42_1B); T42C = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 32)); T42_0C = _mm_unpacklo_epi8(T42C, zero); T42_1C = _mm_unpackhi_epi8(T42C, zero); T42C = _mm_add_epi16(T42_0C, T42_1C); T42D = _mm_loadu_si128((__m128i *)(p_src + 42 * i_src + 48)); T42_0D = _mm_unpacklo_epi8(T42D, zero); T42_1D = _mm_unpackhi_epi8(T42D, zero); T42D = _mm_add_epi16(T42_0D, T42_1D); T42 = _mm_add_epi16(T42A, T42B); T42 = _mm_add_epi16(T42, T42C); T42 = _mm_add_epi16(T42, T42D); T43A = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src)); T43_0A = _mm_unpacklo_epi8(T43A, zero); T43_1A = _mm_unpackhi_epi8(T43A, zero); T43A = _mm_add_epi16(T43_0A, T43_1A); T43B = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 16)); T43_0B = _mm_unpacklo_epi8(T43B, zero); T43_1B = _mm_unpackhi_epi8(T43B, zero); T43B = 
_mm_add_epi16(T43_0B, T43_1B); T43C = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 32)); T43_0C = _mm_unpacklo_epi8(T43C, zero); T43_1C = _mm_unpackhi_epi8(T43C, zero); T43C = _mm_add_epi16(T43_0C, T43_1C); T43D = _mm_loadu_si128((__m128i *)(p_src + 43 * i_src + 48)); T43_0D = _mm_unpacklo_epi8(T43D, zero); T43_1D = _mm_unpackhi_epi8(T43D, zero); T43D = _mm_add_epi16(T43_0D, T43_1D); T43 = _mm_add_epi16(T43A, T43B); T43 = _mm_add_epi16(T43, T43C); T43 = _mm_add_epi16(T43, T43D); T44A = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src)); T44_0A = _mm_unpacklo_epi8(T44A, zero); T44_1A = _mm_unpackhi_epi8(T44A, zero); T44A = _mm_add_epi16(T44_0A, T44_1A); T44B = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 16)); T44_0B = _mm_unpacklo_epi8(T44B, zero); T44_1B = _mm_unpackhi_epi8(T44B, zero); T44B = _mm_add_epi16(T44_0B, T44_1B); T44C = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 32)); T44_0C = _mm_unpacklo_epi8(T44C, zero); T44_1C = _mm_unpackhi_epi8(T44C, zero); T44C = _mm_add_epi16(T44_0C, T44_1C); T44D = _mm_loadu_si128((__m128i *)(p_src + 44 * i_src + 48)); T44_0D = _mm_unpacklo_epi8(T44D, zero); T44_1D = _mm_unpackhi_epi8(T44D, zero); T44D = _mm_add_epi16(T44_0D, T44_1D); T44 = _mm_add_epi16(T44A, T44B); T44 = _mm_add_epi16(T44, T44C); T44 = _mm_add_epi16(T44, T44D); T45A = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src)); T45_0A = _mm_unpacklo_epi8(T45A, zero); T45_1A = _mm_unpackhi_epi8(T45A, zero); T45A = _mm_add_epi16(T45_0A, T45_1A); T45B = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 16)); T45_0B = _mm_unpacklo_epi8(T45B, zero); T45_1B = _mm_unpackhi_epi8(T45B, zero); T45B = _mm_add_epi16(T45_0B, T45_1B); T45C = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 32)); T45_0C = _mm_unpacklo_epi8(T45C, zero); T45_1C = _mm_unpackhi_epi8(T45C, zero); T45C = _mm_add_epi16(T45_0C, T45_1C); T45D = _mm_loadu_si128((__m128i *)(p_src + 45 * i_src + 48)); T45_0D = _mm_unpacklo_epi8(T45D, zero); T45_1D = _mm_unpackhi_epi8(T45D, zero); T45D = _mm_add_epi16(T45_0D, T45_1D); T45 = _mm_add_epi16(T45A, T45B); T45 = _mm_add_epi16(T45, T45C); T45 = _mm_add_epi16(T45, T45D); T46A = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src)); T46_0A = _mm_unpacklo_epi8(T46A, zero); T46_1A = _mm_unpackhi_epi8(T46A, zero); T46A = _mm_add_epi16(T46_0A, T46_1A); T46B = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 16)); T46_0B = _mm_unpacklo_epi8(T46B, zero); T46_1B = _mm_unpackhi_epi8(T46B, zero); T46B = _mm_add_epi16(T46_0B, T46_1B); T46C = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 32)); T46_0C = _mm_unpacklo_epi8(T46C, zero); T46_1C = _mm_unpackhi_epi8(T46C, zero); T46C = _mm_add_epi16(T46_0C, T46_1C); T46D = _mm_loadu_si128((__m128i *)(p_src + 46 * i_src + 48)); T46_0D = _mm_unpacklo_epi8(T46D, zero); T46_1D = _mm_unpackhi_epi8(T46D, zero); T46D = _mm_add_epi16(T46_0D, T46_1D); T46 = _mm_add_epi16(T46A, T46B); T46 = _mm_add_epi16(T46, T46C); T46 = _mm_add_epi16(T46, T46D); T47A = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src)); T47_0A = _mm_unpacklo_epi8(T47A, zero); T47_1A = _mm_unpackhi_epi8(T47A, zero); T47A = _mm_add_epi16(T47_0A, T47_1A); T47B = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 16)); T47_0B = _mm_unpacklo_epi8(T47B, zero); T47_1B = _mm_unpackhi_epi8(T47B, zero); T47B = _mm_add_epi16(T47_0B, T47_1B); T47C = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 32)); T47_0C = _mm_unpacklo_epi8(T47C, zero); T47_1C = _mm_unpackhi_epi8(T47C, zero); T47C = _mm_add_epi16(T47_0C, T47_1C); T47D = _mm_loadu_si128((__m128i *)(p_src + 47 * i_src + 48)); T47_0D = _mm_unpacklo_epi8(T47D, 
zero); T47_1D = _mm_unpackhi_epi8(T47D, zero); T47D = _mm_add_epi16(T47_0D, T47_1D); T47 = _mm_add_epi16(T47A, T47B); T47 = _mm_add_epi16(T47, T47C); T47 = _mm_add_epi16(T47, T47D); T48A = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src)); T48_0A = _mm_unpacklo_epi8(T48A, zero); T48_1A = _mm_unpackhi_epi8(T48A, zero); T48A = _mm_add_epi16(T48_0A, T48_1A); T48B = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 16)); T48_0B = _mm_unpacklo_epi8(T48B, zero); T48_1B = _mm_unpackhi_epi8(T48B, zero); T48B = _mm_add_epi16(T48_0B, T48_1B); T48C = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 32)); T48_0C = _mm_unpacklo_epi8(T48C, zero); T48_1C = _mm_unpackhi_epi8(T48C, zero); T48C = _mm_add_epi16(T48_0C, T48_1C); T48D = _mm_loadu_si128((__m128i *)(p_src + 48 * i_src + 48)); T48_0D = _mm_unpacklo_epi8(T48D, zero); T48_1D = _mm_unpackhi_epi8(T48D, zero); T48D = _mm_add_epi16(T48_0D, T48_1D); T48 = _mm_add_epi16(T48A, T48B); T48 = _mm_add_epi16(T48, T48C); T48 = _mm_add_epi16(T48, T48D); T49A = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src)); T49_0A = _mm_unpacklo_epi8(T49A, zero); T49_1A = _mm_unpackhi_epi8(T49A, zero); T49A = _mm_add_epi16(T49_0A, T49_1A); T49B = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 16)); T49_0B = _mm_unpacklo_epi8(T49B, zero); T49_1B = _mm_unpackhi_epi8(T49B, zero); T49B = _mm_add_epi16(T49_0B, T49_1B); T49C = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 32)); T49_0C = _mm_unpacklo_epi8(T49C, zero); T49_1C = _mm_unpackhi_epi8(T49C, zero); T49C = _mm_add_epi16(T49_0C, T49_1C); T49D = _mm_loadu_si128((__m128i *)(p_src + 49 * i_src + 48)); T49_0D = _mm_unpacklo_epi8(T49D, zero); T49_1D = _mm_unpackhi_epi8(T49D, zero); T49D = _mm_add_epi16(T49_0D, T49_1D); T49 = _mm_add_epi16(T49A, T49B); T49 = _mm_add_epi16(T49, T49C); T49 = _mm_add_epi16(T49, T49D); T50A = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src)); T50_0A = _mm_unpacklo_epi8(T50A, zero); T50_1A = _mm_unpackhi_epi8(T50A, zero); T50A = _mm_add_epi16(T50_0A, T50_1A); T50B = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 16)); T50_0B = _mm_unpacklo_epi8(T50B, zero); T50_1B = _mm_unpackhi_epi8(T50B, zero); T50B = _mm_add_epi16(T50_0B, T50_1B); T50C = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 32)); T50_0C = _mm_unpacklo_epi8(T50C, zero); T50_1C = _mm_unpackhi_epi8(T50C, zero); T50C = _mm_add_epi16(T50_0C, T50_1C); T50D = _mm_loadu_si128((__m128i *)(p_src + 50 * i_src + 48)); T50_0D = _mm_unpacklo_epi8(T50D, zero); T50_1D = _mm_unpackhi_epi8(T50D, zero); T50D = _mm_add_epi16(T50_0D, T50_1D); T50 = _mm_add_epi16(T50A, T50B); T50 = _mm_add_epi16(T50, T50C); T50 = _mm_add_epi16(T50, T50D); T51A = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src)); T51_0A = _mm_unpacklo_epi8(T51A, zero); T51_1A = _mm_unpackhi_epi8(T51A, zero); T51A = _mm_add_epi16(T51_0A, T51_1A); T51B = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 16)); T51_0B = _mm_unpacklo_epi8(T51B, zero); T51_1B = _mm_unpackhi_epi8(T51B, zero); T51B = _mm_add_epi16(T51_0B, T51_1B); T51C = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 32)); T51_0C = _mm_unpacklo_epi8(T51C, zero); T51_1C = _mm_unpackhi_epi8(T51C, zero); T51C = _mm_add_epi16(T51_0C, T51_1C); T51D = _mm_loadu_si128((__m128i *)(p_src + 51 * i_src + 48)); T51_0D = _mm_unpacklo_epi8(T51D, zero); T51_1D = _mm_unpackhi_epi8(T51D, zero); T51D = _mm_add_epi16(T51_0D, T51_1D); T51 = _mm_add_epi16(T51A, T51B); T51 = _mm_add_epi16(T51, T51C); T51 = _mm_add_epi16(T51, T51D); T52A = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src)); T52_0A = _mm_unpacklo_epi8(T52A, zero); T52_1A = 
_mm_unpackhi_epi8(T52A, zero); T52A = _mm_add_epi16(T52_0A, T52_1A); T52B = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 16)); T52_0B = _mm_unpacklo_epi8(T52B, zero); T52_1B = _mm_unpackhi_epi8(T52B, zero); T52B = _mm_add_epi16(T52_0B, T52_1B); T52C = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 32)); T52_0C = _mm_unpacklo_epi8(T52C, zero); T52_1C = _mm_unpackhi_epi8(T52C, zero); T52C = _mm_add_epi16(T52_0C, T52_1C); T52D = _mm_loadu_si128((__m128i *)(p_src + 52 * i_src + 48)); T52_0D = _mm_unpacklo_epi8(T52D, zero); T52_1D = _mm_unpackhi_epi8(T52D, zero); T52D = _mm_add_epi16(T52_0D, T52_1D); T52 = _mm_add_epi16(T52A, T52B); T52 = _mm_add_epi16(T52, T52C); T52 = _mm_add_epi16(T52, T52D); T53A = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src)); T53_0A = _mm_unpacklo_epi8(T53A, zero); T53_1A = _mm_unpackhi_epi8(T53A, zero); T53A = _mm_add_epi16(T53_0A, T53_1A); T53B = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 16)); T53_0B = _mm_unpacklo_epi8(T53B, zero); T53_1B = _mm_unpackhi_epi8(T53B, zero); T53B = _mm_add_epi16(T53_0B, T53_1B); T53C = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 32)); T53_0C = _mm_unpacklo_epi8(T53C, zero); T53_1C = _mm_unpackhi_epi8(T53C, zero); T53C = _mm_add_epi16(T53_0C, T53_1C); T53D = _mm_loadu_si128((__m128i *)(p_src + 53 * i_src + 48)); T53_0D = _mm_unpacklo_epi8(T53D, zero); T53_1D = _mm_unpackhi_epi8(T53D, zero); T53D = _mm_add_epi16(T53_0D, T53_1D); T53 = _mm_add_epi16(T53A, T53B); T53 = _mm_add_epi16(T53, T53C); T53 = _mm_add_epi16(T53, T53D); T54A = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src)); T54_0A = _mm_unpacklo_epi8(T54A, zero); T54_1A = _mm_unpackhi_epi8(T54A, zero); T54A = _mm_add_epi16(T54_0A, T54_1A); T54B = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 16)); T54_0B = _mm_unpacklo_epi8(T54B, zero); T54_1B = _mm_unpackhi_epi8(T54B, zero); T54B = _mm_add_epi16(T54_0B, T54_1B); T54C = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 32)); T54_0C = _mm_unpacklo_epi8(T54C, zero); T54_1C = _mm_unpackhi_epi8(T54C, zero); T54C = _mm_add_epi16(T54_0C, T54_1C); T54D = _mm_loadu_si128((__m128i *)(p_src + 54 * i_src + 48)); T54_0D = _mm_unpacklo_epi8(T54D, zero); T54_1D = _mm_unpackhi_epi8(T54D, zero); T54D = _mm_add_epi16(T54_0D, T54_1D); T54 = _mm_add_epi16(T54A, T54B); T54 = _mm_add_epi16(T54, T54C); T54 = _mm_add_epi16(T54, T54D); T55A = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src)); T55_0A = _mm_unpacklo_epi8(T55A, zero); T55_1A = _mm_unpackhi_epi8(T55A, zero); T55A = _mm_add_epi16(T55_0A, T55_1A); T55B = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 16)); T55_0B = _mm_unpacklo_epi8(T55B, zero); T55_1B = _mm_unpackhi_epi8(T55B, zero); T55B = _mm_add_epi16(T55_0B, T55_1B); T55C = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 32)); T55_0C = _mm_unpacklo_epi8(T55C, zero); T55_1C = _mm_unpackhi_epi8(T55C, zero); T55C = _mm_add_epi16(T55_0C, T55_1C); T55D = _mm_loadu_si128((__m128i *)(p_src + 55 * i_src + 48)); T55_0D = _mm_unpacklo_epi8(T55D, zero); T55_1D = _mm_unpackhi_epi8(T55D, zero); T55D = _mm_add_epi16(T55_0D, T55_1D); T55 = _mm_add_epi16(T55A, T55B); T55 = _mm_add_epi16(T55, T55C); T55 = _mm_add_epi16(T55, T55D); T56A = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src)); T56_0A = _mm_unpacklo_epi8(T56A, zero); T56_1A = _mm_unpackhi_epi8(T56A, zero); T56A = _mm_add_epi16(T56_0A, T56_1A); T56B = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 16)); T56_0B = _mm_unpacklo_epi8(T56B, zero); T56_1B = _mm_unpackhi_epi8(T56B, zero); T56B = _mm_add_epi16(T56_0B, T56_1B); T56C = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 
32)); T56_0C = _mm_unpacklo_epi8(T56C, zero); T56_1C = _mm_unpackhi_epi8(T56C, zero); T56C = _mm_add_epi16(T56_0C, T56_1C); T56D = _mm_loadu_si128((__m128i *)(p_src + 56 * i_src + 48)); T56_0D = _mm_unpacklo_epi8(T56D, zero); T56_1D = _mm_unpackhi_epi8(T56D, zero); T56D = _mm_add_epi16(T56_0D, T56_1D); T56 = _mm_add_epi16(T56A, T56B); T56 = _mm_add_epi16(T56, T56C); T56 = _mm_add_epi16(T56, T56D); T57A = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src)); T57_0A = _mm_unpacklo_epi8(T57A, zero); T57_1A = _mm_unpackhi_epi8(T57A, zero); T57A = _mm_add_epi16(T57_0A, T57_1A); T57B = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 16)); T57_0B = _mm_unpacklo_epi8(T57B, zero); T57_1B = _mm_unpackhi_epi8(T57B, zero); T57B = _mm_add_epi16(T57_0B, T57_1B); T57C = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 32)); T57_0C = _mm_unpacklo_epi8(T57C, zero); T57_1C = _mm_unpackhi_epi8(T57C, zero); T57C = _mm_add_epi16(T57_0C, T57_1C); T57D = _mm_loadu_si128((__m128i *)(p_src + 57 * i_src + 48)); T57_0D = _mm_unpacklo_epi8(T57D, zero); T57_1D = _mm_unpackhi_epi8(T57D, zero); T57D = _mm_add_epi16(T57_0D, T57_1D); T57 = _mm_add_epi16(T57A, T57B); T57 = _mm_add_epi16(T57, T57C); T57 = _mm_add_epi16(T57, T57D); T58A = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src)); T58_0A = _mm_unpacklo_epi8(T58A, zero); T58_1A = _mm_unpackhi_epi8(T58A, zero); T58A = _mm_add_epi16(T58_0A, T58_1A); T58B = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 16)); T58_0B = _mm_unpacklo_epi8(T58B, zero); T58_1B = _mm_unpackhi_epi8(T58B, zero); T58B = _mm_add_epi16(T58_0B, T58_1B); T58C = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 32)); T58_0C = _mm_unpacklo_epi8(T58C, zero); T58_1C = _mm_unpackhi_epi8(T58C, zero); T58C = _mm_add_epi16(T58_0C, T58_1C); T58D = _mm_loadu_si128((__m128i *)(p_src + 58 * i_src + 48)); T58_0D = _mm_unpacklo_epi8(T58D, zero); T58_1D = _mm_unpackhi_epi8(T58D, zero); T58D = _mm_add_epi16(T58_0D, T58_1D); T58 = _mm_add_epi16(T58A, T58B); T58 = _mm_add_epi16(T58, T58C); T58 = _mm_add_epi16(T58, T58D); T59A = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src)); T59_0A = _mm_unpacklo_epi8(T59A, zero); T59_1A = _mm_unpackhi_epi8(T59A, zero); T59A = _mm_add_epi16(T59_0A, T59_1A); T59B = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 16)); T59_0B = _mm_unpacklo_epi8(T59B, zero); T59_1B = _mm_unpackhi_epi8(T59B, zero); T59B = _mm_add_epi16(T59_0B, T59_1B); T59C = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 32)); T59_0C = _mm_unpacklo_epi8(T59C, zero); T59_1C = _mm_unpackhi_epi8(T59C, zero); T59C = _mm_add_epi16(T59_0C, T59_1C); T59D = _mm_loadu_si128((__m128i *)(p_src + 59 * i_src + 48)); T59_0D = _mm_unpacklo_epi8(T59D, zero); T59_1D = _mm_unpackhi_epi8(T59D, zero); T59D = _mm_add_epi16(T59_0D, T59_1D); T59 = _mm_add_epi16(T59A, T59B); T59 = _mm_add_epi16(T59, T59C); T59 = _mm_add_epi16(T59, T59D); T60A = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src)); T60_0A = _mm_unpacklo_epi8(T60A, zero); T60_1A = _mm_unpackhi_epi8(T60A, zero); T60A = _mm_add_epi16(T60_0A, T60_1A); T60B = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 16)); T60_0B = _mm_unpacklo_epi8(T60B, zero); T60_1B = _mm_unpackhi_epi8(T60B, zero); T60B = _mm_add_epi16(T60_0B, T60_1B); T60C = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 32)); T60_0C = _mm_unpacklo_epi8(T60C, zero); T60_1C = _mm_unpackhi_epi8(T60C, zero); T60C = _mm_add_epi16(T60_0C, T60_1C); T60D = _mm_loadu_si128((__m128i *)(p_src + 60 * i_src + 48)); T60_0D = _mm_unpacklo_epi8(T60D, zero); T60_1D = _mm_unpackhi_epi8(T60D, zero); T60D = _mm_add_epi16(T60_0D, T60_1D); 
T60 = _mm_add_epi16(T60A, T60B); T60 = _mm_add_epi16(T60, T60C); T60 = _mm_add_epi16(T60, T60D); T61A = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src)); T61_0A = _mm_unpacklo_epi8(T61A, zero); T61_1A = _mm_unpackhi_epi8(T61A, zero); T61A = _mm_add_epi16(T61_0A, T61_1A); T61B = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 16)); T61_0B = _mm_unpacklo_epi8(T61B, zero); T61_1B = _mm_unpackhi_epi8(T61B, zero); T61B = _mm_add_epi16(T61_0B, T61_1B); T61C = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 32)); T61_0C = _mm_unpacklo_epi8(T61C, zero); T61_1C = _mm_unpackhi_epi8(T61C, zero); T61C = _mm_add_epi16(T61_0C, T61_1C); T61D = _mm_loadu_si128((__m128i *)(p_src + 61 * i_src + 48)); T61_0D = _mm_unpacklo_epi8(T61D, zero); T61_1D = _mm_unpackhi_epi8(T61D, zero); T61D = _mm_add_epi16(T61_0D, T61_1D); T61 = _mm_add_epi16(T61A, T61B); T61 = _mm_add_epi16(T61, T61C); T61 = _mm_add_epi16(T61, T61D); T62A = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src)); T62_0A = _mm_unpacklo_epi8(T62A, zero); T62_1A = _mm_unpackhi_epi8(T62A, zero); T62A = _mm_add_epi16(T62_0A, T62_1A); T62B = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 16)); T62_0B = _mm_unpacklo_epi8(T62B, zero); T62_1B = _mm_unpackhi_epi8(T62B, zero); T62B = _mm_add_epi16(T62_0B, T62_1B); T62C = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 32)); T62_0C = _mm_unpacklo_epi8(T62C, zero); T62_1C = _mm_unpackhi_epi8(T62C, zero); T62C = _mm_add_epi16(T62_0C, T62_1C); T62D = _mm_loadu_si128((__m128i *)(p_src + 62 * i_src + 48)); T62_0D = _mm_unpacklo_epi8(T62D, zero); T62_1D = _mm_unpackhi_epi8(T62D, zero); T62D = _mm_add_epi16(T62_0D, T62_1D); T62 = _mm_add_epi16(T62A, T62B); T62 = _mm_add_epi16(T62, T62C); T62 = _mm_add_epi16(T62, T62D); T63A = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src)); T63_0A = _mm_unpacklo_epi8(T63A, zero); T63_1A = _mm_unpackhi_epi8(T63A, zero); T63A = _mm_add_epi16(T63_0A, T63_1A); T63B = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 16)); T63_0B = _mm_unpacklo_epi8(T63B, zero); T63_1B = _mm_unpackhi_epi8(T63B, zero); T63B = _mm_add_epi16(T63_0B, T63_1B); T63C = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 32)); T63_0C = _mm_unpacklo_epi8(T63C, zero); T63_1C = _mm_unpackhi_epi8(T63C, zero); T63C = _mm_add_epi16(T63_0C, T63_1C); T63D = _mm_loadu_si128((__m128i *)(p_src + 63 * i_src + 48)); T63_0D = _mm_unpacklo_epi8(T63D, zero); T63_1D = _mm_unpackhi_epi8(T63D, zero); T63D = _mm_add_epi16(T63_0D, T63_1D); T63 = _mm_add_epi16(T63A, T63B); T63 = _mm_add_epi16(T63, T63C); T63 = _mm_add_epi16(T63, T63D); S1 = _mm_add_epi16(T0, T1); S1 = _mm_add_epi16(S1, T2); S1 = _mm_add_epi16(S1, T3); S1 = _mm_add_epi16(S1, T4); S1 = _mm_add_epi16(S1, T5); S1 = _mm_add_epi16(S1, T6); S1 = _mm_add_epi16(S1, T7); S1 = _mm_add_epi16(S1, T8); S1 = _mm_add_epi16(S1, T9); S1 = _mm_add_epi16(S1, T10); S1 = _mm_add_epi16(S1, T11); S1 = _mm_add_epi16(S1, T12); S1 = _mm_add_epi16(S1, T13); S1 = _mm_add_epi16(S1, T14); S1 = _mm_add_epi16(S1, T15); S2 = _mm_add_epi16(T16, T17); S2 = _mm_add_epi16(S2, T18); S2 = _mm_add_epi16(S2, T19); S2 = _mm_add_epi16(S2, T20); S2 = _mm_add_epi16(S2, T21); S2 = _mm_add_epi16(S2, T22); S2 = _mm_add_epi16(S2, T23); S2 = _mm_add_epi16(S2, T24); S2 = _mm_add_epi16(S2, T25); S2 = _mm_add_epi16(S2, T26); S2 = _mm_add_epi16(S2, T27); S2 = _mm_add_epi16(S2, T28); S2 = _mm_add_epi16(S2, T29); S2 = _mm_add_epi16(S2, T30); S2 = _mm_add_epi16(S2, T31); S3 = _mm_add_epi16(T32, T33); S3 = _mm_add_epi16(S3, T34); S3 = _mm_add_epi16(S3, T35); S3 = _mm_add_epi16(S3, T36); S3 = _mm_add_epi16(S3, T37); S3 = 
_mm_add_epi16(S3, T38); S3 = _mm_add_epi16(S3, T39); S3 = _mm_add_epi16(S3, T40); S3 = _mm_add_epi16(S3, T41); S3 = _mm_add_epi16(S3, T42); S3 = _mm_add_epi16(S3, T43); S3 = _mm_add_epi16(S3, T44); S3 = _mm_add_epi16(S3, T45); S3 = _mm_add_epi16(S3, T46); S3 = _mm_add_epi16(S3, T47); S = _mm_add_epi16(T48, T49); S = _mm_add_epi16(S, T50); S = _mm_add_epi16(S, T51); S = _mm_add_epi16(S, T52); S = _mm_add_epi16(S, T53); S = _mm_add_epi16(S, T54); S = _mm_add_epi16(S, T55); S = _mm_add_epi16(S, T56); S = _mm_add_epi16(S, T57); S = _mm_add_epi16(S, T58); S = _mm_add_epi16(S, T59); S = _mm_add_epi16(S, T60); S = _mm_add_epi16(S, T61); S = _mm_add_epi16(S, T62); S = _mm_add_epi16(S, T63); sum1 = M128_U16(S1, 0) + M128_U16(S1, 1) + M128_U16(S1, 2) + M128_U16(S1, 3) + M128_U16(S1, 4) + M128_U16(S1, 5) + M128_U16(S1, 6) + M128_U16(S1, 7); sum2 = M128_U16(S2, 0) + M128_U16(S2, 1) + M128_U16(S2, 2) + M128_U16(S2, 3) + M128_U16(S2, 4) + M128_U16(S2, 5) + M128_U16(S2, 6) + M128_U16(S2, 7); sum3 = M128_U16(S3, 0) + M128_U16(S3, 1) + M128_U16(S3, 2) + M128_U16(S3, 3) + M128_U16(S3, 4) + M128_U16(S3, 5) + M128_U16(S3, 6) + M128_U16(S3, 7); sum = M128_U16(S, 0) + M128_U16(S, 1) + M128_U16(S, 2) + M128_U16(S, 3) + M128_U16(S, 4) + M128_U16(S, 5) + M128_U16(S, 6) + M128_U16(S, 7); sum = sum + sum1 + sum2 + sum3; f_avg = (sum + (num_pix >> 1)) / num_pix; avg = _mm_set1_epi16((short)f_avg); /* cal mad */ /*for (int y = 0; y < cu_size; ++y) { for (int x = 0; x < cu_size; ++x) { int f_pxl = p_src[x]; mad += AVS2_ABS(f_pxl - f_avg); } p_src += i_src; }*/ T0_0A = _mm_sub_epi16(T0_0A, avg); T0_1A = _mm_sub_epi16(T0_1A, avg); T0_0B = _mm_sub_epi16(T0_0B, avg); T0_1B = _mm_sub_epi16(T0_1B, avg); T0_0C = _mm_sub_epi16(T0_0C, avg); T0_1C = _mm_sub_epi16(T0_1C, avg); T0_0D = _mm_sub_epi16(T0_0D, avg); T0_1D = _mm_sub_epi16(T0_1D, avg); T1_0A = _mm_sub_epi16(T1_0A, avg); T1_1A = _mm_sub_epi16(T1_1A, avg); T1_0B = _mm_sub_epi16(T1_0B, avg); T1_1B = _mm_sub_epi16(T1_1B, avg); T1_0C = _mm_sub_epi16(T1_0C, avg); T1_1C = _mm_sub_epi16(T1_1C, avg); T1_0D = _mm_sub_epi16(T1_0D, avg); T1_1D = _mm_sub_epi16(T1_1D, avg); T2_0A = _mm_sub_epi16(T2_0A, avg); T2_1A = _mm_sub_epi16(T2_1A, avg); T2_0B = _mm_sub_epi16(T2_0B, avg); T2_1B = _mm_sub_epi16(T2_1B, avg); T2_0C = _mm_sub_epi16(T2_0C, avg); T2_1C = _mm_sub_epi16(T2_1C, avg); T2_0D = _mm_sub_epi16(T2_0D, avg); T2_1D = _mm_sub_epi16(T2_1D, avg); T3_0A = _mm_sub_epi16(T3_0A, avg); T3_1A = _mm_sub_epi16(T3_1A, avg); T3_0B = _mm_sub_epi16(T3_0B, avg); T3_1B = _mm_sub_epi16(T3_1B, avg); T3_0C = _mm_sub_epi16(T3_0C, avg); T3_1C = _mm_sub_epi16(T3_1C, avg); T3_0D = _mm_sub_epi16(T3_0D, avg); T3_1D = _mm_sub_epi16(T3_1D, avg); T4_0A = _mm_sub_epi16(T4_0A, avg); T4_1A = _mm_sub_epi16(T4_1A, avg); T4_0B = _mm_sub_epi16(T4_0B, avg); T4_1B = _mm_sub_epi16(T4_1B, avg); T4_0C = _mm_sub_epi16(T4_0C, avg); T4_1C = _mm_sub_epi16(T4_1C, avg); T4_0D = _mm_sub_epi16(T4_0D, avg); T4_1D = _mm_sub_epi16(T4_1D, avg); T5_0A = _mm_sub_epi16(T5_0A, avg); T5_1A = _mm_sub_epi16(T5_1A, avg); T5_0B = _mm_sub_epi16(T5_0B, avg); T5_1B = _mm_sub_epi16(T5_1B, avg); T5_0C = _mm_sub_epi16(T5_0C, avg); T5_1C = _mm_sub_epi16(T5_1C, avg); T5_0D = _mm_sub_epi16(T5_0D, avg); T5_1D = _mm_sub_epi16(T5_1D, avg); T6_0A = _mm_sub_epi16(T6_0A, avg); T6_1A = _mm_sub_epi16(T6_1A, avg); T6_0B = _mm_sub_epi16(T6_0B, avg); T6_1B = _mm_sub_epi16(T6_1B, avg); T6_0C = _mm_sub_epi16(T6_0C, avg); T6_1C = _mm_sub_epi16(T6_1C, avg); T6_0D = _mm_sub_epi16(T6_0D, avg); T6_1D = _mm_sub_epi16(T6_1D, avg); T7_0A = _mm_sub_epi16(T7_0A, avg); 
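/* The rounded block average f_avg has been broadcast into `avg`; it is
 * subtracted from every 16-bit lane of rows 0..63 (chunks A..D), and the
 * absolute differences are then accumulated per row, which is the vectorized
 * form of the commented-out scalar MAD loop above. */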
T7_1A = _mm_sub_epi16(T7_1A, avg); T7_0B = _mm_sub_epi16(T7_0B, avg); T7_1B = _mm_sub_epi16(T7_1B, avg); T7_0C = _mm_sub_epi16(T7_0C, avg); T7_1C = _mm_sub_epi16(T7_1C, avg); T7_0D = _mm_sub_epi16(T7_0D, avg); T7_1D = _mm_sub_epi16(T7_1D, avg); T8_0A = _mm_sub_epi16(T8_0A, avg); T8_1A = _mm_sub_epi16(T8_1A, avg); T8_0B = _mm_sub_epi16(T8_0B, avg); T8_1B = _mm_sub_epi16(T8_1B, avg); T8_0C = _mm_sub_epi16(T8_0C, avg); T8_1C = _mm_sub_epi16(T8_1C, avg); T8_0D = _mm_sub_epi16(T8_0D, avg); T8_1D = _mm_sub_epi16(T8_1D, avg); T9_0A = _mm_sub_epi16(T9_0A, avg); T9_1A = _mm_sub_epi16(T9_1A, avg); T9_0B = _mm_sub_epi16(T9_0B, avg); T9_1B = _mm_sub_epi16(T9_1B, avg); T9_0C = _mm_sub_epi16(T9_0C, avg); T9_1C = _mm_sub_epi16(T9_1C, avg); T9_0D = _mm_sub_epi16(T9_0D, avg); T9_1D = _mm_sub_epi16(T9_1D, avg); T10_0A = _mm_sub_epi16(T10_0A, avg); T10_1A = _mm_sub_epi16(T10_1A, avg); T10_0B = _mm_sub_epi16(T10_0B, avg); T10_1B = _mm_sub_epi16(T10_1B, avg); T10_0C = _mm_sub_epi16(T10_0C, avg); T10_1C = _mm_sub_epi16(T10_1C, avg); T10_0D = _mm_sub_epi16(T10_0D, avg); T10_1D = _mm_sub_epi16(T10_1D, avg); T11_0A = _mm_sub_epi16(T11_0A, avg); T11_1A = _mm_sub_epi16(T11_1A, avg); T11_0B = _mm_sub_epi16(T11_0B, avg); T11_1B = _mm_sub_epi16(T11_1B, avg); T11_0C = _mm_sub_epi16(T11_0C, avg); T11_1C = _mm_sub_epi16(T11_1C, avg); T11_0D = _mm_sub_epi16(T11_0D, avg); T11_1D = _mm_sub_epi16(T11_1D, avg); T12_0A = _mm_sub_epi16(T12_0A, avg); T12_1A = _mm_sub_epi16(T12_1A, avg); T12_0B = _mm_sub_epi16(T12_0B, avg); T12_1B = _mm_sub_epi16(T12_1B, avg); T12_0C = _mm_sub_epi16(T12_0C, avg); T12_1C = _mm_sub_epi16(T12_1C, avg); T12_0D = _mm_sub_epi16(T12_0D, avg); T12_1D = _mm_sub_epi16(T12_1D, avg); T13_0A = _mm_sub_epi16(T13_0A, avg); T13_1A = _mm_sub_epi16(T13_1A, avg); T13_0B = _mm_sub_epi16(T13_0B, avg); T13_1B = _mm_sub_epi16(T13_1B, avg); T13_0C = _mm_sub_epi16(T13_0C, avg); T13_1C = _mm_sub_epi16(T13_1C, avg); T13_0D = _mm_sub_epi16(T13_0D, avg); T13_1D = _mm_sub_epi16(T13_1D, avg); T14_0A = _mm_sub_epi16(T14_0A, avg); T14_1A = _mm_sub_epi16(T14_1A, avg); T14_0B = _mm_sub_epi16(T14_0B, avg); T14_1B = _mm_sub_epi16(T14_1B, avg); T14_0C = _mm_sub_epi16(T14_0C, avg); T14_1C = _mm_sub_epi16(T14_1C, avg); T14_0D = _mm_sub_epi16(T14_0D, avg); T14_1D = _mm_sub_epi16(T14_1D, avg); T15_0A = _mm_sub_epi16(T15_0A, avg); T15_1A = _mm_sub_epi16(T15_1A, avg); T15_0B = _mm_sub_epi16(T15_0B, avg); T15_1B = _mm_sub_epi16(T15_1B, avg); T15_0C = _mm_sub_epi16(T15_0C, avg); T15_1C = _mm_sub_epi16(T15_1C, avg); T15_0D = _mm_sub_epi16(T15_0D, avg); T15_1D = _mm_sub_epi16(T15_1D, avg); T16_0A = _mm_sub_epi16(T16_0A, avg); T16_1A = _mm_sub_epi16(T16_1A, avg); T16_0B = _mm_sub_epi16(T16_0B, avg); T16_1B = _mm_sub_epi16(T16_1B, avg); T16_0C = _mm_sub_epi16(T16_0C, avg); T16_1C = _mm_sub_epi16(T16_1C, avg); T16_0D = _mm_sub_epi16(T16_0D, avg); T16_1D = _mm_sub_epi16(T16_1D, avg); T17_0A = _mm_sub_epi16(T17_0A, avg); T17_1A = _mm_sub_epi16(T17_1A, avg); T17_0B = _mm_sub_epi16(T17_0B, avg); T17_1B = _mm_sub_epi16(T17_1B, avg); T17_0C = _mm_sub_epi16(T17_0C, avg); T17_1C = _mm_sub_epi16(T17_1C, avg); T17_0D = _mm_sub_epi16(T17_0D, avg); T17_1D = _mm_sub_epi16(T17_1D, avg); T18_0A = _mm_sub_epi16(T18_0A, avg); T18_1A = _mm_sub_epi16(T18_1A, avg); T18_0B = _mm_sub_epi16(T18_0B, avg); T18_1B = _mm_sub_epi16(T18_1B, avg); T18_0C = _mm_sub_epi16(T18_0C, avg); T18_1C = _mm_sub_epi16(T18_1C, avg); T18_0D = _mm_sub_epi16(T18_0D, avg); T18_1D = _mm_sub_epi16(T18_1D, avg); T19_0A = _mm_sub_epi16(T19_0A, avg); T19_1A = _mm_sub_epi16(T19_1A, avg); T19_0B = 
_mm_sub_epi16(T19_0B, avg); T19_1B = _mm_sub_epi16(T19_1B, avg); T19_0C = _mm_sub_epi16(T19_0C, avg); T19_1C = _mm_sub_epi16(T19_1C, avg); T19_0D = _mm_sub_epi16(T19_0D, avg); T19_1D = _mm_sub_epi16(T19_1D, avg); T20_0A = _mm_sub_epi16(T20_0A, avg); T20_1A = _mm_sub_epi16(T20_1A, avg); T20_0B = _mm_sub_epi16(T20_0B, avg); T20_1B = _mm_sub_epi16(T20_1B, avg); T20_0C = _mm_sub_epi16(T20_0C, avg); T20_1C = _mm_sub_epi16(T20_1C, avg); T20_0D = _mm_sub_epi16(T20_0D, avg); T20_1D = _mm_sub_epi16(T20_1D, avg); T21_0A = _mm_sub_epi16(T21_0A, avg); T21_1A = _mm_sub_epi16(T21_1A, avg); T21_0B = _mm_sub_epi16(T21_0B, avg); T21_1B = _mm_sub_epi16(T21_1B, avg); T21_0C = _mm_sub_epi16(T21_0C, avg); T21_1C = _mm_sub_epi16(T21_1C, avg); T21_0D = _mm_sub_epi16(T21_0D, avg); T21_1D = _mm_sub_epi16(T21_1D, avg); T22_0A = _mm_sub_epi16(T22_0A, avg); T22_1A = _mm_sub_epi16(T22_1A, avg); T22_0B = _mm_sub_epi16(T22_0B, avg); T22_1B = _mm_sub_epi16(T22_1B, avg); T22_0C = _mm_sub_epi16(T22_0C, avg); T22_1C = _mm_sub_epi16(T22_1C, avg); T22_0D = _mm_sub_epi16(T22_0D, avg); T22_1D = _mm_sub_epi16(T22_1D, avg); T23_0A = _mm_sub_epi16(T23_0A, avg); T23_1A = _mm_sub_epi16(T23_1A, avg); T23_0B = _mm_sub_epi16(T23_0B, avg); T23_1B = _mm_sub_epi16(T23_1B, avg); T23_0C = _mm_sub_epi16(T23_0C, avg); T23_1C = _mm_sub_epi16(T23_1C, avg); T23_0D = _mm_sub_epi16(T23_0D, avg); T23_1D = _mm_sub_epi16(T23_1D, avg); T24_0A = _mm_sub_epi16(T24_0A, avg); T24_1A = _mm_sub_epi16(T24_1A, avg); T24_0B = _mm_sub_epi16(T24_0B, avg); T24_1B = _mm_sub_epi16(T24_1B, avg); T24_0C = _mm_sub_epi16(T24_0C, avg); T24_1C = _mm_sub_epi16(T24_1C, avg); T24_0D = _mm_sub_epi16(T24_0D, avg); T24_1D = _mm_sub_epi16(T24_1D, avg); T25_0A = _mm_sub_epi16(T25_0A, avg); T25_1A = _mm_sub_epi16(T25_1A, avg); T25_0B = _mm_sub_epi16(T25_0B, avg); T25_1B = _mm_sub_epi16(T25_1B, avg); T25_0C = _mm_sub_epi16(T25_0C, avg); T25_1C = _mm_sub_epi16(T25_1C, avg); T25_0D = _mm_sub_epi16(T25_0D, avg); T25_1D = _mm_sub_epi16(T25_1D, avg); T26_0A = _mm_sub_epi16(T26_0A, avg); T26_1A = _mm_sub_epi16(T26_1A, avg); T26_0B = _mm_sub_epi16(T26_0B, avg); T26_1B = _mm_sub_epi16(T26_1B, avg); T26_0C = _mm_sub_epi16(T26_0C, avg); T26_1C = _mm_sub_epi16(T26_1C, avg); T26_0D = _mm_sub_epi16(T26_0D, avg); T26_1D = _mm_sub_epi16(T26_1D, avg); T27_0A = _mm_sub_epi16(T27_0A, avg); T27_1A = _mm_sub_epi16(T27_1A, avg); T27_0B = _mm_sub_epi16(T27_0B, avg); T27_1B = _mm_sub_epi16(T27_1B, avg); T27_0C = _mm_sub_epi16(T27_0C, avg); T27_1C = _mm_sub_epi16(T27_1C, avg); T27_0D = _mm_sub_epi16(T27_0D, avg); T27_1D = _mm_sub_epi16(T27_1D, avg); T28_0A = _mm_sub_epi16(T28_0A, avg); T28_1A = _mm_sub_epi16(T28_1A, avg); T28_0B = _mm_sub_epi16(T28_0B, avg); T28_1B = _mm_sub_epi16(T28_1B, avg); T28_0C = _mm_sub_epi16(T28_0C, avg); T28_1C = _mm_sub_epi16(T28_1C, avg); T28_0D = _mm_sub_epi16(T28_0D, avg); T28_1D = _mm_sub_epi16(T28_1D, avg); T29_0A = _mm_sub_epi16(T29_0A, avg); T29_1A = _mm_sub_epi16(T29_1A, avg); T29_0B = _mm_sub_epi16(T29_0B, avg); T29_1B = _mm_sub_epi16(T29_1B, avg); T29_0C = _mm_sub_epi16(T29_0C, avg); T29_1C = _mm_sub_epi16(T29_1C, avg); T29_0D = _mm_sub_epi16(T29_0D, avg); T29_1D = _mm_sub_epi16(T29_1D, avg); T30_0A = _mm_sub_epi16(T30_0A, avg); T30_1A = _mm_sub_epi16(T30_1A, avg); T30_0B = _mm_sub_epi16(T30_0B, avg); T30_1B = _mm_sub_epi16(T30_1B, avg); T30_0C = _mm_sub_epi16(T30_0C, avg); T30_1C = _mm_sub_epi16(T30_1C, avg); T30_0D = _mm_sub_epi16(T30_0D, avg); T30_1D = _mm_sub_epi16(T30_1D, avg); T31_0A = _mm_sub_epi16(T31_0A, avg); T31_1A = _mm_sub_epi16(T31_1A, avg); T31_0B = 
_mm_sub_epi16(T31_0B, avg); T31_1B = _mm_sub_epi16(T31_1B, avg); T31_0C = _mm_sub_epi16(T31_0C, avg); T31_1C = _mm_sub_epi16(T31_1C, avg); T31_0D = _mm_sub_epi16(T31_0D, avg); T31_1D = _mm_sub_epi16(T31_1D, avg); T32_0A = _mm_sub_epi16(T32_0A, avg); T32_1A = _mm_sub_epi16(T32_1A, avg); T32_0B = _mm_sub_epi16(T32_0B, avg); T32_1B = _mm_sub_epi16(T32_1B, avg); T32_0C = _mm_sub_epi16(T32_0C, avg); T32_1C = _mm_sub_epi16(T32_1C, avg); T32_0D = _mm_sub_epi16(T32_0D, avg); T32_1D = _mm_sub_epi16(T32_1D, avg); T33_0A = _mm_sub_epi16(T33_0A, avg); T33_1A = _mm_sub_epi16(T33_1A, avg); T33_0B = _mm_sub_epi16(T33_0B, avg); T33_1B = _mm_sub_epi16(T33_1B, avg); T33_0C = _mm_sub_epi16(T33_0C, avg); T33_1C = _mm_sub_epi16(T33_1C, avg); T33_0D = _mm_sub_epi16(T33_0D, avg); T33_1D = _mm_sub_epi16(T33_1D, avg); T34_0A = _mm_sub_epi16(T34_0A, avg); T34_1A = _mm_sub_epi16(T34_1A, avg); T34_0B = _mm_sub_epi16(T34_0B, avg); T34_1B = _mm_sub_epi16(T34_1B, avg); T34_0C = _mm_sub_epi16(T34_0C, avg); T34_1C = _mm_sub_epi16(T34_1C, avg); T34_0D = _mm_sub_epi16(T34_0D, avg); T34_1D = _mm_sub_epi16(T34_1D, avg); T35_0A = _mm_sub_epi16(T35_0A, avg); T35_1A = _mm_sub_epi16(T35_1A, avg); T35_0B = _mm_sub_epi16(T35_0B, avg); T35_1B = _mm_sub_epi16(T35_1B, avg); T35_0C = _mm_sub_epi16(T35_0C, avg); T35_1C = _mm_sub_epi16(T35_1C, avg); T35_0D = _mm_sub_epi16(T35_0D, avg); T35_1D = _mm_sub_epi16(T35_1D, avg); T36_0A = _mm_sub_epi16(T36_0A, avg); T36_1A = _mm_sub_epi16(T36_1A, avg); T36_0B = _mm_sub_epi16(T36_0B, avg); T36_1B = _mm_sub_epi16(T36_1B, avg); T36_0C = _mm_sub_epi16(T36_0C, avg); T36_1C = _mm_sub_epi16(T36_1C, avg); T36_0D = _mm_sub_epi16(T36_0D, avg); T36_1D = _mm_sub_epi16(T36_1D, avg); T37_0A = _mm_sub_epi16(T37_0A, avg); T37_1A = _mm_sub_epi16(T37_1A, avg); T37_0B = _mm_sub_epi16(T37_0B, avg); T37_1B = _mm_sub_epi16(T37_1B, avg); T37_0C = _mm_sub_epi16(T37_0C, avg); T37_1C = _mm_sub_epi16(T37_1C, avg); T37_0D = _mm_sub_epi16(T37_0D, avg); T37_1D = _mm_sub_epi16(T37_1D, avg); T38_0A = _mm_sub_epi16(T38_0A, avg); T38_1A = _mm_sub_epi16(T38_1A, avg); T38_0B = _mm_sub_epi16(T38_0B, avg); T38_1B = _mm_sub_epi16(T38_1B, avg); T38_0C = _mm_sub_epi16(T38_0C, avg); T38_1C = _mm_sub_epi16(T38_1C, avg); T38_0D = _mm_sub_epi16(T38_0D, avg); T38_1D = _mm_sub_epi16(T38_1D, avg); T39_0A = _mm_sub_epi16(T39_0A, avg); T39_1A = _mm_sub_epi16(T39_1A, avg); T39_0B = _mm_sub_epi16(T39_0B, avg); T39_1B = _mm_sub_epi16(T39_1B, avg); T39_0C = _mm_sub_epi16(T39_0C, avg); T39_1C = _mm_sub_epi16(T39_1C, avg); T39_0D = _mm_sub_epi16(T39_0D, avg); T39_1D = _mm_sub_epi16(T39_1D, avg); T40_0A = _mm_sub_epi16(T40_0A, avg); T40_1A = _mm_sub_epi16(T40_1A, avg); T40_0B = _mm_sub_epi16(T40_0B, avg); T40_1B = _mm_sub_epi16(T40_1B, avg); T40_0C = _mm_sub_epi16(T40_0C, avg); T40_1C = _mm_sub_epi16(T40_1C, avg); T40_0D = _mm_sub_epi16(T40_0D, avg); T40_1D = _mm_sub_epi16(T40_1D, avg); T41_0A = _mm_sub_epi16(T41_0A, avg); T41_1A = _mm_sub_epi16(T41_1A, avg); T41_0B = _mm_sub_epi16(T41_0B, avg); T41_1B = _mm_sub_epi16(T41_1B, avg); T41_0C = _mm_sub_epi16(T41_0C, avg); T41_1C = _mm_sub_epi16(T41_1C, avg); T41_0D = _mm_sub_epi16(T41_0D, avg); T41_1D = _mm_sub_epi16(T41_1D, avg); T42_0A = _mm_sub_epi16(T42_0A, avg); T42_1A = _mm_sub_epi16(T42_1A, avg); T42_0B = _mm_sub_epi16(T42_0B, avg); T42_1B = _mm_sub_epi16(T42_1B, avg); T42_0C = _mm_sub_epi16(T42_0C, avg); T42_1C = _mm_sub_epi16(T42_1C, avg); T42_0D = _mm_sub_epi16(T42_0D, avg); T42_1D = _mm_sub_epi16(T42_1D, avg); T43_0A = _mm_sub_epi16(T43_0A, avg); T43_1A = _mm_sub_epi16(T43_1A, avg); T43_0B = 
_mm_sub_epi16(T43_0B, avg); T43_1B = _mm_sub_epi16(T43_1B, avg); T43_0C = _mm_sub_epi16(T43_0C, avg); T43_1C = _mm_sub_epi16(T43_1C, avg); T43_0D = _mm_sub_epi16(T43_0D, avg); T43_1D = _mm_sub_epi16(T43_1D, avg); T44_0A = _mm_sub_epi16(T44_0A, avg); T44_1A = _mm_sub_epi16(T44_1A, avg); T44_0B = _mm_sub_epi16(T44_0B, avg); T44_1B = _mm_sub_epi16(T44_1B, avg); T44_0C = _mm_sub_epi16(T44_0C, avg); T44_1C = _mm_sub_epi16(T44_1C, avg); T44_0D = _mm_sub_epi16(T44_0D, avg); T44_1D = _mm_sub_epi16(T44_1D, avg); T45_0A = _mm_sub_epi16(T45_0A, avg); T45_1A = _mm_sub_epi16(T45_1A, avg); T45_0B = _mm_sub_epi16(T45_0B, avg); T45_1B = _mm_sub_epi16(T45_1B, avg); T45_0C = _mm_sub_epi16(T45_0C, avg); T45_1C = _mm_sub_epi16(T45_1C, avg); T45_0D = _mm_sub_epi16(T45_0D, avg); T45_1D = _mm_sub_epi16(T45_1D, avg); T46_0A = _mm_sub_epi16(T46_0A, avg); T46_1A = _mm_sub_epi16(T46_1A, avg); T46_0B = _mm_sub_epi16(T46_0B, avg); T46_1B = _mm_sub_epi16(T46_1B, avg); T46_0C = _mm_sub_epi16(T46_0C, avg); T46_1C = _mm_sub_epi16(T46_1C, avg); T46_0D = _mm_sub_epi16(T46_0D, avg); T46_1D = _mm_sub_epi16(T46_1D, avg); T47_0A = _mm_sub_epi16(T47_0A, avg); T47_1A = _mm_sub_epi16(T47_1A, avg); T47_0B = _mm_sub_epi16(T47_0B, avg); T47_1B = _mm_sub_epi16(T47_1B, avg); T47_0C = _mm_sub_epi16(T47_0C, avg); T47_1C = _mm_sub_epi16(T47_1C, avg); T47_0D = _mm_sub_epi16(T47_0D, avg); T47_1D = _mm_sub_epi16(T47_1D, avg); T48_0A = _mm_sub_epi16(T48_0A, avg); T48_1A = _mm_sub_epi16(T48_1A, avg); T48_0B = _mm_sub_epi16(T48_0B, avg); T48_1B = _mm_sub_epi16(T48_1B, avg); T48_0C = _mm_sub_epi16(T48_0C, avg); T48_1C = _mm_sub_epi16(T48_1C, avg); T48_0D = _mm_sub_epi16(T48_0D, avg); T48_1D = _mm_sub_epi16(T48_1D, avg); T49_0A = _mm_sub_epi16(T49_0A, avg); T49_1A = _mm_sub_epi16(T49_1A, avg); T49_0B = _mm_sub_epi16(T49_0B, avg); T49_1B = _mm_sub_epi16(T49_1B, avg); T49_0C = _mm_sub_epi16(T49_0C, avg); T49_1C = _mm_sub_epi16(T49_1C, avg); T49_0D = _mm_sub_epi16(T49_0D, avg); T49_1D = _mm_sub_epi16(T49_1D, avg); T50_0A = _mm_sub_epi16(T50_0A, avg); T50_1A = _mm_sub_epi16(T50_1A, avg); T50_0B = _mm_sub_epi16(T50_0B, avg); T50_1B = _mm_sub_epi16(T50_1B, avg); T50_0C = _mm_sub_epi16(T50_0C, avg); T50_1C = _mm_sub_epi16(T50_1C, avg); T50_0D = _mm_sub_epi16(T50_0D, avg); T50_1D = _mm_sub_epi16(T50_1D, avg); T51_0A = _mm_sub_epi16(T51_0A, avg); T51_1A = _mm_sub_epi16(T51_1A, avg); T51_0B = _mm_sub_epi16(T51_0B, avg); T51_1B = _mm_sub_epi16(T51_1B, avg); T51_0C = _mm_sub_epi16(T51_0C, avg); T51_1C = _mm_sub_epi16(T51_1C, avg); T51_0D = _mm_sub_epi16(T51_0D, avg); T51_1D = _mm_sub_epi16(T51_1D, avg); T52_0A = _mm_sub_epi16(T52_0A, avg); T52_1A = _mm_sub_epi16(T52_1A, avg); T52_0B = _mm_sub_epi16(T52_0B, avg); T52_1B = _mm_sub_epi16(T52_1B, avg); T52_0C = _mm_sub_epi16(T52_0C, avg); T52_1C = _mm_sub_epi16(T52_1C, avg); T52_0D = _mm_sub_epi16(T52_0D, avg); T52_1D = _mm_sub_epi16(T52_1D, avg); T53_0A = _mm_sub_epi16(T53_0A, avg); T53_1A = _mm_sub_epi16(T53_1A, avg); T53_0B = _mm_sub_epi16(T53_0B, avg); T53_1B = _mm_sub_epi16(T53_1B, avg); T53_0C = _mm_sub_epi16(T53_0C, avg); T53_1C = _mm_sub_epi16(T53_1C, avg); T53_0D = _mm_sub_epi16(T53_0D, avg); T53_1D = _mm_sub_epi16(T53_1D, avg); T54_0A = _mm_sub_epi16(T54_0A, avg); T54_1A = _mm_sub_epi16(T54_1A, avg); T54_0B = _mm_sub_epi16(T54_0B, avg); T54_1B = _mm_sub_epi16(T54_1B, avg); T54_0C = _mm_sub_epi16(T54_0C, avg); T54_1C = _mm_sub_epi16(T54_1C, avg); T54_0D = _mm_sub_epi16(T54_0D, avg); T54_1D = _mm_sub_epi16(T54_1D, avg); T55_0A = _mm_sub_epi16(T55_0A, avg); T55_1A = _mm_sub_epi16(T55_1A, avg); T55_0B = 
_mm_sub_epi16(T55_0B, avg); T55_1B = _mm_sub_epi16(T55_1B, avg); T55_0C = _mm_sub_epi16(T55_0C, avg); T55_1C = _mm_sub_epi16(T55_1C, avg); T55_0D = _mm_sub_epi16(T55_0D, avg); T55_1D = _mm_sub_epi16(T55_1D, avg); T56_0A = _mm_sub_epi16(T56_0A, avg); T56_1A = _mm_sub_epi16(T56_1A, avg); T56_0B = _mm_sub_epi16(T56_0B, avg); T56_1B = _mm_sub_epi16(T56_1B, avg); T56_0C = _mm_sub_epi16(T56_0C, avg); T56_1C = _mm_sub_epi16(T56_1C, avg); T56_0D = _mm_sub_epi16(T56_0D, avg); T56_1D = _mm_sub_epi16(T56_1D, avg); T57_0A = _mm_sub_epi16(T57_0A, avg); T57_1A = _mm_sub_epi16(T57_1A, avg); T57_0B = _mm_sub_epi16(T57_0B, avg); T57_1B = _mm_sub_epi16(T57_1B, avg); T57_0C = _mm_sub_epi16(T57_0C, avg); T57_1C = _mm_sub_epi16(T57_1C, avg); T57_0D = _mm_sub_epi16(T57_0D, avg); T57_1D = _mm_sub_epi16(T57_1D, avg); T58_0A = _mm_sub_epi16(T58_0A, avg); T58_1A = _mm_sub_epi16(T58_1A, avg); T58_0B = _mm_sub_epi16(T58_0B, avg); T58_1B = _mm_sub_epi16(T58_1B, avg); T58_0C = _mm_sub_epi16(T58_0C, avg); T58_1C = _mm_sub_epi16(T58_1C, avg); T58_0D = _mm_sub_epi16(T58_0D, avg); T58_1D = _mm_sub_epi16(T58_1D, avg); T59_0A = _mm_sub_epi16(T59_0A, avg); T59_1A = _mm_sub_epi16(T59_1A, avg); T59_0B = _mm_sub_epi16(T59_0B, avg); T59_1B = _mm_sub_epi16(T59_1B, avg); T59_0C = _mm_sub_epi16(T59_0C, avg); T59_1C = _mm_sub_epi16(T59_1C, avg); T59_0D = _mm_sub_epi16(T59_0D, avg); T59_1D = _mm_sub_epi16(T59_1D, avg); T60_0A = _mm_sub_epi16(T60_0A, avg); T60_1A = _mm_sub_epi16(T60_1A, avg); T60_0B = _mm_sub_epi16(T60_0B, avg); T60_1B = _mm_sub_epi16(T60_1B, avg); T60_0C = _mm_sub_epi16(T60_0C, avg); T60_1C = _mm_sub_epi16(T60_1C, avg); T60_0D = _mm_sub_epi16(T60_0D, avg); T60_1D = _mm_sub_epi16(T60_1D, avg); T61_0A = _mm_sub_epi16(T61_0A, avg); T61_1A = _mm_sub_epi16(T61_1A, avg); T61_0B = _mm_sub_epi16(T61_0B, avg); T61_1B = _mm_sub_epi16(T61_1B, avg); T61_0C = _mm_sub_epi16(T61_0C, avg); T61_1C = _mm_sub_epi16(T61_1C, avg); T61_0D = _mm_sub_epi16(T61_0D, avg); T61_1D = _mm_sub_epi16(T61_1D, avg); T62_0A = _mm_sub_epi16(T62_0A, avg); T62_1A = _mm_sub_epi16(T62_1A, avg); T62_0B = _mm_sub_epi16(T62_0B, avg); T62_1B = _mm_sub_epi16(T62_1B, avg); T62_0C = _mm_sub_epi16(T62_0C, avg); T62_1C = _mm_sub_epi16(T62_1C, avg); T62_0D = _mm_sub_epi16(T62_0D, avg); T62_1D = _mm_sub_epi16(T62_1D, avg); T63_0A = _mm_sub_epi16(T63_0A, avg); T63_1A = _mm_sub_epi16(T63_1A, avg); T63_0B = _mm_sub_epi16(T63_0B, avg); T63_1B = _mm_sub_epi16(T63_1B, avg); T63_0C = _mm_sub_epi16(T63_0C, avg); T63_1C = _mm_sub_epi16(T63_1C, avg); T63_0D = _mm_sub_epi16(T63_0D, avg); T63_1D = _mm_sub_epi16(T63_1D, avg); T0_0A = _mm_abs_epi16(T0_0A); T0_1A = _mm_abs_epi16(T0_1A); T0_0B = _mm_abs_epi16(T0_0B); T0_1B = _mm_abs_epi16(T0_1B); T0_0C = _mm_abs_epi16(T0_0C); T0_1C = _mm_abs_epi16(T0_1C); T0_0D = _mm_abs_epi16(T0_0D); T0_1D = _mm_abs_epi16(T0_1D); T0 = _mm_add_epi16(T0_0A, T0_1A); T0 = _mm_add_epi16(T0, T0_0B); T0 = _mm_add_epi16(T0, T0_1B); T0 = _mm_add_epi16(T0, T0_0C); T0 = _mm_add_epi16(T0, T0_1C); T0 = _mm_add_epi16(T0, T0_0D); T0 = _mm_add_epi16(T0, T0_1D); T1_0A = _mm_abs_epi16(T1_0A); T1_1A = _mm_abs_epi16(T1_1A); T1_0B = _mm_abs_epi16(T1_0B); T1_1B = _mm_abs_epi16(T1_1B); T1_0C = _mm_abs_epi16(T1_0C); T1_1C = _mm_abs_epi16(T1_1C); T1_0D = _mm_abs_epi16(T1_0D); T1_1D = _mm_abs_epi16(T1_1D); T1 = _mm_add_epi16(T1_0A, T1_1A); T1 = _mm_add_epi16(T1, T1_0B); T1 = _mm_add_epi16(T1, T1_1B); T1 = _mm_add_epi16(T1, T1_0C); T1 = _mm_add_epi16(T1, T1_1C); T1 = _mm_add_epi16(T1, T1_0D); T1 = _mm_add_epi16(T1, T1_1D); T2_0A = _mm_abs_epi16(T2_0A); T2_1A = 
_mm_abs_epi16(T2_1A); T2_0B = _mm_abs_epi16(T2_0B); T2_1B = _mm_abs_epi16(T2_1B); T2_0C = _mm_abs_epi16(T2_0C); T2_1C = _mm_abs_epi16(T2_1C); T2_0D = _mm_abs_epi16(T2_0D); T2_1D = _mm_abs_epi16(T2_1D); T2 = _mm_add_epi16(T2_0A, T2_1A); T2 = _mm_add_epi16(T2, T2_0B); T2 = _mm_add_epi16(T2, T2_1B); T2 = _mm_add_epi16(T2, T2_0C); T2 = _mm_add_epi16(T2, T2_1C); T2 = _mm_add_epi16(T2, T2_0D); T2 = _mm_add_epi16(T2, T2_1D); T3_0A = _mm_abs_epi16(T3_0A); T3_1A = _mm_abs_epi16(T3_1A); T3_0B = _mm_abs_epi16(T3_0B); T3_1B = _mm_abs_epi16(T3_1B); T3_0C = _mm_abs_epi16(T3_0C); T3_1C = _mm_abs_epi16(T3_1C); T3_0D = _mm_abs_epi16(T3_0D); T3_1D = _mm_abs_epi16(T3_1D); T3 = _mm_add_epi16(T3_0A, T3_1A); T3 = _mm_add_epi16(T3, T3_0B); T3 = _mm_add_epi16(T3, T3_1B); T3 = _mm_add_epi16(T3, T3_0C); T3 = _mm_add_epi16(T3, T3_1C); T3 = _mm_add_epi16(T3, T3_0D); T3 = _mm_add_epi16(T3, T3_1D); T4_0A = _mm_abs_epi16(T4_0A); T4_1A = _mm_abs_epi16(T4_1A); T4_0B = _mm_abs_epi16(T4_0B); T4_1B = _mm_abs_epi16(T4_1B); T4_0C = _mm_abs_epi16(T4_0C); T4_1C = _mm_abs_epi16(T4_1C); T4_0D = _mm_abs_epi16(T4_0D); T4_1D = _mm_abs_epi16(T4_1D); T4 = _mm_add_epi16(T4_0A, T4_1A); T4 = _mm_add_epi16(T4, T4_0B); T4 = _mm_add_epi16(T4, T4_1B); T4 = _mm_add_epi16(T4, T4_0C); T4 = _mm_add_epi16(T4, T4_1C); T4 = _mm_add_epi16(T4, T4_0D); T4 = _mm_add_epi16(T4, T4_1D); T5_0A = _mm_abs_epi16(T5_0A); T5_1A = _mm_abs_epi16(T5_1A); T5_0B = _mm_abs_epi16(T5_0B); T5_1B = _mm_abs_epi16(T5_1B); T5_0C = _mm_abs_epi16(T5_0C); T5_1C = _mm_abs_epi16(T5_1C); T5_0D = _mm_abs_epi16(T5_0D); T5_1D = _mm_abs_epi16(T5_1D); T5 = _mm_add_epi16(T5_0A, T5_1A); T5 = _mm_add_epi16(T5, T5_0B); T5 = _mm_add_epi16(T5, T5_1B); T5 = _mm_add_epi16(T5, T5_0C); T5 = _mm_add_epi16(T5, T5_1C); T5 = _mm_add_epi16(T5, T5_0D); T5 = _mm_add_epi16(T5, T5_1D); T6_0A = _mm_abs_epi16(T6_0A); T6_1A = _mm_abs_epi16(T6_1A); T6_0B = _mm_abs_epi16(T6_0B); T6_1B = _mm_abs_epi16(T6_1B); T6_0C = _mm_abs_epi16(T6_0C); T6_1C = _mm_abs_epi16(T6_1C); T6_0D = _mm_abs_epi16(T6_0D); T6_1D = _mm_abs_epi16(T6_1D); T6 = _mm_add_epi16(T6_0A, T6_1A); T6 = _mm_add_epi16(T6, T6_0B); T6 = _mm_add_epi16(T6, T6_1B); T6 = _mm_add_epi16(T6, T6_0C); T6 = _mm_add_epi16(T6, T6_1C); T6 = _mm_add_epi16(T6, T6_0D); T6 = _mm_add_epi16(T6, T6_1D); T7_0A = _mm_abs_epi16(T7_0A); T7_1A = _mm_abs_epi16(T7_1A); T7_0B = _mm_abs_epi16(T7_0B); T7_1B = _mm_abs_epi16(T7_1B); T7_0C = _mm_abs_epi16(T7_0C); T7_1C = _mm_abs_epi16(T7_1C); T7_0D = _mm_abs_epi16(T7_0D); T7_1D = _mm_abs_epi16(T7_1D); T7 = _mm_add_epi16(T7_0A, T7_1A); T7 = _mm_add_epi16(T7, T7_0B); T7 = _mm_add_epi16(T7, T7_1B); T7 = _mm_add_epi16(T7, T7_0C); T7 = _mm_add_epi16(T7, T7_1C); T7 = _mm_add_epi16(T7, T7_0D); T7 = _mm_add_epi16(T7, T7_1D); T8_0A = _mm_abs_epi16(T8_0A); T8_1A = _mm_abs_epi16(T8_1A); T8_0B = _mm_abs_epi16(T8_0B); T8_1B = _mm_abs_epi16(T8_1B); T8_0C = _mm_abs_epi16(T8_0C); T8_1C = _mm_abs_epi16(T8_1C); T8_0D = _mm_abs_epi16(T8_0D); T8_1D = _mm_abs_epi16(T8_1D); T8 = _mm_add_epi16(T8_0A, T8_1A); T8 = _mm_add_epi16(T8, T8_0B); T8 = _mm_add_epi16(T8, T8_1B); T8 = _mm_add_epi16(T8, T8_0C); T8 = _mm_add_epi16(T8, T8_1C); T8 = _mm_add_epi16(T8, T8_0D); T8 = _mm_add_epi16(T8, T8_1D); T9_0A = _mm_abs_epi16(T9_0A); T9_1A = _mm_abs_epi16(T9_1A); T9_0B = _mm_abs_epi16(T9_0B); T9_1B = _mm_abs_epi16(T9_1B); T9_0C = _mm_abs_epi16(T9_0C); T9_1C = _mm_abs_epi16(T9_1C); T9_0D = _mm_abs_epi16(T9_0D); T9_1D = _mm_abs_epi16(T9_1D); T9 = _mm_add_epi16(T9_0A, T9_1A); T9 = _mm_add_epi16(T9, T9_0B); T9 = _mm_add_epi16(T9, T9_1B); T9 = _mm_add_epi16(T9, T9_0C); T9 = 
_mm_add_epi16(T9, T9_1C); T9 = _mm_add_epi16(T9, T9_0D); T9 = _mm_add_epi16(T9, T9_1D); T10_0A = _mm_abs_epi16(T10_0A); T10_1A = _mm_abs_epi16(T10_1A); T10_0B = _mm_abs_epi16(T10_0B); T10_1B = _mm_abs_epi16(T10_1B); T10_0C = _mm_abs_epi16(T10_0C); T10_1C = _mm_abs_epi16(T10_1C); T10_0D = _mm_abs_epi16(T10_0D); T10_1D = _mm_abs_epi16(T10_1D); T10 = _mm_add_epi16(T10_0A, T10_1A); T10 = _mm_add_epi16(T10, T10_0B); T10 = _mm_add_epi16(T10, T10_1B); T10 = _mm_add_epi16(T10, T10_0C); T10 = _mm_add_epi16(T10, T10_1C); T10 = _mm_add_epi16(T10, T10_0D); T10 = _mm_add_epi16(T10, T10_1D); T11_0A = _mm_abs_epi16(T11_0A); T11_1A = _mm_abs_epi16(T11_1A); T11_0B = _mm_abs_epi16(T11_0B); T11_1B = _mm_abs_epi16(T11_1B); T11_0C = _mm_abs_epi16(T11_0C); T11_1C = _mm_abs_epi16(T11_1C); T11_0D = _mm_abs_epi16(T11_0D); T11_1D = _mm_abs_epi16(T11_1D); T11 = _mm_add_epi16(T11_0A, T11_1A); T11 = _mm_add_epi16(T11, T11_0B); T11 = _mm_add_epi16(T11, T11_1B); T11 = _mm_add_epi16(T11, T11_0C); T11 = _mm_add_epi16(T11, T11_1C); T11 = _mm_add_epi16(T11, T11_0D); T11 = _mm_add_epi16(T11, T11_1D); T12_0A = _mm_abs_epi16(T12_0A); T12_1A = _mm_abs_epi16(T12_1A); T12_0B = _mm_abs_epi16(T12_0B); T12_1B = _mm_abs_epi16(T12_1B); T12_0C = _mm_abs_epi16(T12_0C); T12_1C = _mm_abs_epi16(T12_1C); T12_0D = _mm_abs_epi16(T12_0D); T12_1D = _mm_abs_epi16(T12_1D); T12 = _mm_add_epi16(T12_0A, T12_1A); T12 = _mm_add_epi16(T12, T12_0B); T12 = _mm_add_epi16(T12, T12_1B); T12 = _mm_add_epi16(T12, T12_0C); T12 = _mm_add_epi16(T12, T12_1C); T12 = _mm_add_epi16(T12, T12_0D); T12 = _mm_add_epi16(T12, T12_1D); T13_0A = _mm_abs_epi16(T13_0A); T13_1A = _mm_abs_epi16(T13_1A); T13_0B = _mm_abs_epi16(T13_0B); T13_1B = _mm_abs_epi16(T13_1B); T13_0C = _mm_abs_epi16(T13_0C); T13_1C = _mm_abs_epi16(T13_1C); T13_0D = _mm_abs_epi16(T13_0D); T13_1D = _mm_abs_epi16(T13_1D); T13 = _mm_add_epi16(T13_0A, T13_1A); T13 = _mm_add_epi16(T13, T13_0B); T13 = _mm_add_epi16(T13, T13_1B); T13 = _mm_add_epi16(T13, T13_0C); T13 = _mm_add_epi16(T13, T13_1C); T13 = _mm_add_epi16(T13, T13_0D); T13 = _mm_add_epi16(T13, T13_1D); T14_0A = _mm_abs_epi16(T14_0A); T14_1A = _mm_abs_epi16(T14_1A); T14_0B = _mm_abs_epi16(T14_0B); T14_1B = _mm_abs_epi16(T14_1B); T14_0C = _mm_abs_epi16(T14_0C); T14_1C = _mm_abs_epi16(T14_1C); T14_0D = _mm_abs_epi16(T14_0D); T14_1D = _mm_abs_epi16(T14_1D); T14 = _mm_add_epi16(T14_0A, T14_1A); T14 = _mm_add_epi16(T14, T14_0B); T14 = _mm_add_epi16(T14, T14_1B); T14 = _mm_add_epi16(T14, T14_0C); T14 = _mm_add_epi16(T14, T14_1C); T14 = _mm_add_epi16(T14, T14_0D); T14 = _mm_add_epi16(T14, T14_1D); T15_0A = _mm_abs_epi16(T15_0A); T15_1A = _mm_abs_epi16(T15_1A); T15_0B = _mm_abs_epi16(T15_0B); T15_1B = _mm_abs_epi16(T15_1B); T15_0C = _mm_abs_epi16(T15_0C); T15_1C = _mm_abs_epi16(T15_1C); T15_0D = _mm_abs_epi16(T15_0D); T15_1D = _mm_abs_epi16(T15_1D); T15 = _mm_add_epi16(T15_0A, T15_1A); T15 = _mm_add_epi16(T15, T15_0B); T15 = _mm_add_epi16(T15, T15_1B); T15 = _mm_add_epi16(T15, T15_0C); T15 = _mm_add_epi16(T15, T15_1C); T15 = _mm_add_epi16(T15, T15_0D); T15 = _mm_add_epi16(T15, T15_1D); T16_0A = _mm_abs_epi16(T16_0A); T16_1A = _mm_abs_epi16(T16_1A); T16_0B = _mm_abs_epi16(T16_0B); T16_1B = _mm_abs_epi16(T16_1B); T16_0C = _mm_abs_epi16(T16_0C); T16_1C = _mm_abs_epi16(T16_1C); T16_0D = _mm_abs_epi16(T16_0D); T16_1D = _mm_abs_epi16(T16_1D); T16 = _mm_add_epi16(T16_0A, T16_1A); T16 = _mm_add_epi16(T16, T16_0B); T16 = _mm_add_epi16(T16, T16_1B); T16 = _mm_add_epi16(T16, T16_0C); T16 = _mm_add_epi16(T16, T16_1C); T16 = _mm_add_epi16(T16, T16_0D); T16 = 
_mm_add_epi16(T16, T16_1D); T17_0A = _mm_abs_epi16(T17_0A); T17_1A = _mm_abs_epi16(T17_1A); T17_0B = _mm_abs_epi16(T17_0B); T17_1B = _mm_abs_epi16(T17_1B); T17_0C = _mm_abs_epi16(T17_0C); T17_1C = _mm_abs_epi16(T17_1C); T17_0D = _mm_abs_epi16(T17_0D); T17_1D = _mm_abs_epi16(T17_1D); T17 = _mm_add_epi16(T17_0A, T17_1A); T17 = _mm_add_epi16(T17, T17_0B); T17 = _mm_add_epi16(T17, T17_1B); T17 = _mm_add_epi16(T17, T17_0C); T17 = _mm_add_epi16(T17, T17_1C); T17 = _mm_add_epi16(T17, T17_0D); T17 = _mm_add_epi16(T17, T17_1D); T18_0A = _mm_abs_epi16(T18_0A); T18_1A = _mm_abs_epi16(T18_1A); T18_0B = _mm_abs_epi16(T18_0B); T18_1B = _mm_abs_epi16(T18_1B); T18_0C = _mm_abs_epi16(T18_0C); T18_1C = _mm_abs_epi16(T18_1C); T18_0D = _mm_abs_epi16(T18_0D); T18_1D = _mm_abs_epi16(T18_1D); T18 = _mm_add_epi16(T18_0A, T18_1A); T18 = _mm_add_epi16(T18, T18_0B); T18 = _mm_add_epi16(T18, T18_1B); T18 = _mm_add_epi16(T18, T18_0C); T18 = _mm_add_epi16(T18, T18_1C); T18 = _mm_add_epi16(T18, T18_0D); T18 = _mm_add_epi16(T18, T18_1D); T19_0A = _mm_abs_epi16(T19_0A); T19_1A = _mm_abs_epi16(T19_1A); T19_0B = _mm_abs_epi16(T19_0B); T19_1B = _mm_abs_epi16(T19_1B); T19_0C = _mm_abs_epi16(T19_0C); T19_1C = _mm_abs_epi16(T19_1C); T19_0D = _mm_abs_epi16(T19_0D); T19_1D = _mm_abs_epi16(T19_1D); T19 = _mm_add_epi16(T19_0A, T19_1A); T19 = _mm_add_epi16(T19, T19_0B); T19 = _mm_add_epi16(T19, T19_1B); T19 = _mm_add_epi16(T19, T19_0C); T19 = _mm_add_epi16(T19, T19_1C); T19 = _mm_add_epi16(T19, T19_0D); T19 = _mm_add_epi16(T19, T19_1D); T20_0A = _mm_abs_epi16(T20_0A); T20_1A = _mm_abs_epi16(T20_1A); T20_0B = _mm_abs_epi16(T20_0B); T20_1B = _mm_abs_epi16(T20_1B); T20_0C = _mm_abs_epi16(T20_0C); T20_1C = _mm_abs_epi16(T20_1C); T20_0D = _mm_abs_epi16(T20_0D); T20_1D = _mm_abs_epi16(T20_1D); T20 = _mm_add_epi16(T20_0A, T20_1A); T20 = _mm_add_epi16(T20, T20_0B); T20 = _mm_add_epi16(T20, T20_1B); T20 = _mm_add_epi16(T20, T20_0C); T20 = _mm_add_epi16(T20, T20_1C); T20 = _mm_add_epi16(T20, T20_0D); T20 = _mm_add_epi16(T20, T20_1D); T21_0A = _mm_abs_epi16(T21_0A); T21_1A = _mm_abs_epi16(T21_1A); T21_0B = _mm_abs_epi16(T21_0B); T21_1B = _mm_abs_epi16(T21_1B); T21_0C = _mm_abs_epi16(T21_0C); T21_1C = _mm_abs_epi16(T21_1C); T21_0D = _mm_abs_epi16(T21_0D); T21_1D = _mm_abs_epi16(T21_1D); T21 = _mm_add_epi16(T21_0A, T21_1A); T21 = _mm_add_epi16(T21, T21_0B); T21 = _mm_add_epi16(T21, T21_1B); T21 = _mm_add_epi16(T21, T21_0C); T21 = _mm_add_epi16(T21, T21_1C); T21 = _mm_add_epi16(T21, T21_0D); T21 = _mm_add_epi16(T21, T21_1D); T22_0A = _mm_abs_epi16(T22_0A); T22_1A = _mm_abs_epi16(T22_1A); T22_0B = _mm_abs_epi16(T22_0B); T22_1B = _mm_abs_epi16(T22_1B); T22_0C = _mm_abs_epi16(T22_0C); T22_1C = _mm_abs_epi16(T22_1C); T22_0D = _mm_abs_epi16(T22_0D); T22_1D = _mm_abs_epi16(T22_1D); T22 = _mm_add_epi16(T22_0A, T22_1A); T22 = _mm_add_epi16(T22, T22_0B); T22 = _mm_add_epi16(T22, T22_1B); T22 = _mm_add_epi16(T22, T22_0C); T22 = _mm_add_epi16(T22, T22_1C); T22 = _mm_add_epi16(T22, T22_0D); T22 = _mm_add_epi16(T22, T22_1D); T23_0A = _mm_abs_epi16(T23_0A); T23_1A = _mm_abs_epi16(T23_1A); T23_0B = _mm_abs_epi16(T23_0B); T23_1B = _mm_abs_epi16(T23_1B); T23_0C = _mm_abs_epi16(T23_0C); T23_1C = _mm_abs_epi16(T23_1C); T23_0D = _mm_abs_epi16(T23_0D); T23_1D = _mm_abs_epi16(T23_1D); T23 = _mm_add_epi16(T23_0A, T23_1A); T23 = _mm_add_epi16(T23, T23_0B); T23 = _mm_add_epi16(T23, T23_1B); T23 = _mm_add_epi16(T23, T23_0C); T23 = _mm_add_epi16(T23, T23_1C); T23 = _mm_add_epi16(T23, T23_0D); T23 = _mm_add_epi16(T23, T23_1D); T24_0A = _mm_abs_epi16(T24_0A); T24_1A = 
_mm_abs_epi16(T24_1A); T24_0B = _mm_abs_epi16(T24_0B); T24_1B = _mm_abs_epi16(T24_1B); T24_0C = _mm_abs_epi16(T24_0C); T24_1C = _mm_abs_epi16(T24_1C); T24_0D = _mm_abs_epi16(T24_0D); T24_1D = _mm_abs_epi16(T24_1D); T24 = _mm_add_epi16(T24_0A, T24_1A); T24 = _mm_add_epi16(T24, T24_0B); T24 = _mm_add_epi16(T24, T24_1B); T24 = _mm_add_epi16(T24, T24_0C); T24 = _mm_add_epi16(T24, T24_1C); T24 = _mm_add_epi16(T24, T24_0D); T24 = _mm_add_epi16(T24, T24_1D); T25_0A = _mm_abs_epi16(T25_0A); T25_1A = _mm_abs_epi16(T25_1A); T25_0B = _mm_abs_epi16(T25_0B); T25_1B = _mm_abs_epi16(T25_1B); T25_0C = _mm_abs_epi16(T25_0C); T25_1C = _mm_abs_epi16(T25_1C); T25_0D = _mm_abs_epi16(T25_0D); T25_1D = _mm_abs_epi16(T25_1D); T25 = _mm_add_epi16(T25_0A, T25_1A); T25 = _mm_add_epi16(T25, T25_0B); T25 = _mm_add_epi16(T25, T25_1B); T25 = _mm_add_epi16(T25, T25_0C); T25 = _mm_add_epi16(T25, T25_1C); T25 = _mm_add_epi16(T25, T25_0D); T25 = _mm_add_epi16(T25, T25_1D); T26_0A = _mm_abs_epi16(T26_0A); T26_1A = _mm_abs_epi16(T26_1A); T26_0B = _mm_abs_epi16(T26_0B); T26_1B = _mm_abs_epi16(T26_1B); T26_0C = _mm_abs_epi16(T26_0C); T26_1C = _mm_abs_epi16(T26_1C); T26_0D = _mm_abs_epi16(T26_0D); T26_1D = _mm_abs_epi16(T26_1D); T26 = _mm_add_epi16(T26_0A, T26_1A); T26 = _mm_add_epi16(T26, T26_0B); T26 = _mm_add_epi16(T26, T26_1B); T26 = _mm_add_epi16(T26, T26_0C); T26 = _mm_add_epi16(T26, T26_1C); T26 = _mm_add_epi16(T26, T26_0D); T26 = _mm_add_epi16(T26, T26_1D); T27_0A = _mm_abs_epi16(T27_0A); T27_1A = _mm_abs_epi16(T27_1A); T27_0B = _mm_abs_epi16(T27_0B); T27_1B = _mm_abs_epi16(T27_1B); T27_0C = _mm_abs_epi16(T27_0C); T27_1C = _mm_abs_epi16(T27_1C); T27_0D = _mm_abs_epi16(T27_0D); T27_1D = _mm_abs_epi16(T27_1D); T27 = _mm_add_epi16(T27_0A, T27_1A); T27 = _mm_add_epi16(T27, T27_0B); T27 = _mm_add_epi16(T27, T27_1B); T27 = _mm_add_epi16(T27, T27_0C); T27 = _mm_add_epi16(T27, T27_1C); T27 = _mm_add_epi16(T27, T27_0D); T27 = _mm_add_epi16(T27, T27_1D); T28_0A = _mm_abs_epi16(T28_0A); T28_1A = _mm_abs_epi16(T28_1A); T28_0B = _mm_abs_epi16(T28_0B); T28_1B = _mm_abs_epi16(T28_1B); T28_0C = _mm_abs_epi16(T28_0C); T28_1C = _mm_abs_epi16(T28_1C); T28_0D = _mm_abs_epi16(T28_0D); T28_1D = _mm_abs_epi16(T28_1D); T28 = _mm_add_epi16(T28_0A, T28_1A); T28 = _mm_add_epi16(T28, T28_0B); T28 = _mm_add_epi16(T28, T28_1B); T28 = _mm_add_epi16(T28, T28_0C); T28 = _mm_add_epi16(T28, T28_1C); T28 = _mm_add_epi16(T28, T28_0D); T28 = _mm_add_epi16(T28, T28_1D); T29_0A = _mm_abs_epi16(T29_0A); T29_1A = _mm_abs_epi16(T29_1A); T29_0B = _mm_abs_epi16(T29_0B); T29_1B = _mm_abs_epi16(T29_1B); T29_0C = _mm_abs_epi16(T29_0C); T29_1C = _mm_abs_epi16(T29_1C); T29_0D = _mm_abs_epi16(T29_0D); T29_1D = _mm_abs_epi16(T29_1D); T29 = _mm_add_epi16(T29_0A, T29_1A); T29 = _mm_add_epi16(T29, T29_0B); T29 = _mm_add_epi16(T29, T29_1B); T29 = _mm_add_epi16(T29, T29_0C); T29 = _mm_add_epi16(T29, T29_1C); T29 = _mm_add_epi16(T29, T29_0D); T29 = _mm_add_epi16(T29, T29_1D); T30_0A = _mm_abs_epi16(T30_0A); T30_1A = _mm_abs_epi16(T30_1A); T30_0B = _mm_abs_epi16(T30_0B); T30_1B = _mm_abs_epi16(T30_1B); T30_0C = _mm_abs_epi16(T30_0C); T30_1C = _mm_abs_epi16(T30_1C); T30_0D = _mm_abs_epi16(T30_0D); T30_1D = _mm_abs_epi16(T30_1D); T30 = _mm_add_epi16(T30_0A, T30_1A); T30 = _mm_add_epi16(T30, T30_0B); T30 = _mm_add_epi16(T30, T30_1B); T30 = _mm_add_epi16(T30, T30_0C); T30 = _mm_add_epi16(T30, T30_1C); T30 = _mm_add_epi16(T30, T30_0D); T30 = _mm_add_epi16(T30, T30_1D); T31_0A = _mm_abs_epi16(T31_0A); T31_1A = _mm_abs_epi16(T31_1A); T31_0B = _mm_abs_epi16(T31_0B); T31_1B = 
_mm_abs_epi16(T31_1B); T31_0C = _mm_abs_epi16(T31_0C); T31_1C = _mm_abs_epi16(T31_1C); T31_0D = _mm_abs_epi16(T31_0D); T31_1D = _mm_abs_epi16(T31_1D); T31 = _mm_add_epi16(T31_0A, T31_1A); T31 = _mm_add_epi16(T31, T31_0B); T31 = _mm_add_epi16(T31, T31_1B); T31 = _mm_add_epi16(T31, T31_0C); T31 = _mm_add_epi16(T31, T31_1C); T31 = _mm_add_epi16(T31, T31_0D); T31 = _mm_add_epi16(T31, T31_1D); T32_0A = _mm_abs_epi16(T32_0A); T32_1A = _mm_abs_epi16(T32_1A); T32_0B = _mm_abs_epi16(T32_0B); T32_1B = _mm_abs_epi16(T32_1B); T32_0C = _mm_abs_epi16(T32_0C); T32_1C = _mm_abs_epi16(T32_1C); T32_0D = _mm_abs_epi16(T32_0D); T32_1D = _mm_abs_epi16(T32_1D); T32 = _mm_add_epi16(T32_0A, T32_1A); T32 = _mm_add_epi16(T32, T32_0B); T32 = _mm_add_epi16(T32, T32_1B); T32 = _mm_add_epi16(T32, T32_0C); T32 = _mm_add_epi16(T32, T32_1C); T32 = _mm_add_epi16(T32, T32_0D); T32 = _mm_add_epi16(T32, T32_1D); T33_0A = _mm_abs_epi16(T33_0A); T33_1A = _mm_abs_epi16(T33_1A); T33_0B = _mm_abs_epi16(T33_0B); T33_1B = _mm_abs_epi16(T33_1B); T33_0C = _mm_abs_epi16(T33_0C); T33_1C = _mm_abs_epi16(T33_1C); T33_0D = _mm_abs_epi16(T33_0D); T33_1D = _mm_abs_epi16(T33_1D); T33 = _mm_add_epi16(T33_0A, T33_1A); T33 = _mm_add_epi16(T33, T33_0B); T33 = _mm_add_epi16(T33, T33_1B); T33 = _mm_add_epi16(T33, T33_0C); T33 = _mm_add_epi16(T33, T33_1C); T33 = _mm_add_epi16(T33, T33_0D); T33 = _mm_add_epi16(T33, T33_1D); T34_0A = _mm_abs_epi16(T34_0A); T34_1A = _mm_abs_epi16(T34_1A); T34_0B = _mm_abs_epi16(T34_0B); T34_1B = _mm_abs_epi16(T34_1B); T34_0C = _mm_abs_epi16(T34_0C); T34_1C = _mm_abs_epi16(T34_1C); T34_0D = _mm_abs_epi16(T34_0D); T34_1D = _mm_abs_epi16(T34_1D); T34 = _mm_add_epi16(T34_0A, T34_1A); T34 = _mm_add_epi16(T34, T34_0B); T34 = _mm_add_epi16(T34, T34_1B); T34 = _mm_add_epi16(T34, T34_0C); T34 = _mm_add_epi16(T34, T34_1C); T34 = _mm_add_epi16(T34, T34_0D); T34 = _mm_add_epi16(T34, T34_1D); T35_0A = _mm_abs_epi16(T35_0A); T35_1A = _mm_abs_epi16(T35_1A); T35_0B = _mm_abs_epi16(T35_0B); T35_1B = _mm_abs_epi16(T35_1B); T35_0C = _mm_abs_epi16(T35_0C); T35_1C = _mm_abs_epi16(T35_1C); T35_0D = _mm_abs_epi16(T35_0D); T35_1D = _mm_abs_epi16(T35_1D); T35 = _mm_add_epi16(T35_0A, T35_1A); T35 = _mm_add_epi16(T35, T35_0B); T35 = _mm_add_epi16(T35, T35_1B); T35 = _mm_add_epi16(T35, T35_0C); T35 = _mm_add_epi16(T35, T35_1C); T35 = _mm_add_epi16(T35, T35_0D); T35 = _mm_add_epi16(T35, T35_1D); T36_0A = _mm_abs_epi16(T36_0A); T36_1A = _mm_abs_epi16(T36_1A); T36_0B = _mm_abs_epi16(T36_0B); T36_1B = _mm_abs_epi16(T36_1B); T36_0C = _mm_abs_epi16(T36_0C); T36_1C = _mm_abs_epi16(T36_1C); T36_0D = _mm_abs_epi16(T36_0D); T36_1D = _mm_abs_epi16(T36_1D); T36 = _mm_add_epi16(T36_0A, T36_1A); T36 = _mm_add_epi16(T36, T36_0B); T36 = _mm_add_epi16(T36, T36_1B); T36 = _mm_add_epi16(T36, T36_0C); T36 = _mm_add_epi16(T36, T36_1C); T36 = _mm_add_epi16(T36, T36_0D); T36 = _mm_add_epi16(T36, T36_1D); T37_0A = _mm_abs_epi16(T37_0A); T37_1A = _mm_abs_epi16(T37_1A); T37_0B = _mm_abs_epi16(T37_0B); T37_1B = _mm_abs_epi16(T37_1B); T37_0C = _mm_abs_epi16(T37_0C); T37_1C = _mm_abs_epi16(T37_1C); T37_0D = _mm_abs_epi16(T37_0D); T37_1D = _mm_abs_epi16(T37_1D); T37 = _mm_add_epi16(T37_0A, T37_1A); T37 = _mm_add_epi16(T37, T37_0B); T37 = _mm_add_epi16(T37, T37_1B); T37 = _mm_add_epi16(T37, T37_0C); T37 = _mm_add_epi16(T37, T37_1C); T37 = _mm_add_epi16(T37, T37_0D); T37 = _mm_add_epi16(T37, T37_1D); T38_0A = _mm_abs_epi16(T38_0A); T38_1A = _mm_abs_epi16(T38_1A); T38_0B = _mm_abs_epi16(T38_0B); T38_1B = _mm_abs_epi16(T38_1B); T38_0C = _mm_abs_epi16(T38_0C); T38_1C = 
_mm_abs_epi16(T38_1C); T38_0D = _mm_abs_epi16(T38_0D); T38_1D = _mm_abs_epi16(T38_1D); T38 = _mm_add_epi16(T38_0A, T38_1A); T38 = _mm_add_epi16(T38, T38_0B); T38 = _mm_add_epi16(T38, T38_1B); T38 = _mm_add_epi16(T38, T38_0C); T38 = _mm_add_epi16(T38, T38_1C); T38 = _mm_add_epi16(T38, T38_0D); T38 = _mm_add_epi16(T38, T38_1D); T39_0A = _mm_abs_epi16(T39_0A); T39_1A = _mm_abs_epi16(T39_1A); T39_0B = _mm_abs_epi16(T39_0B); T39_1B = _mm_abs_epi16(T39_1B); T39_0C = _mm_abs_epi16(T39_0C); T39_1C = _mm_abs_epi16(T39_1C); T39_0D = _mm_abs_epi16(T39_0D); T39_1D = _mm_abs_epi16(T39_1D); T39 = _mm_add_epi16(T39_0A, T39_1A); T39 = _mm_add_epi16(T39, T39_0B); T39 = _mm_add_epi16(T39, T39_1B); T39 = _mm_add_epi16(T39, T39_0C); T39 = _mm_add_epi16(T39, T39_1C); T39 = _mm_add_epi16(T39, T39_0D); T39 = _mm_add_epi16(T39, T39_1D); T40_0A = _mm_abs_epi16(T40_0A); T40_1A = _mm_abs_epi16(T40_1A); T40_0B = _mm_abs_epi16(T40_0B); T40_1B = _mm_abs_epi16(T40_1B); T40_0C = _mm_abs_epi16(T40_0C); T40_1C = _mm_abs_epi16(T40_1C); T40_0D = _mm_abs_epi16(T40_0D); T40_1D = _mm_abs_epi16(T40_1D); T40 = _mm_add_epi16(T40_0A, T40_1A); T40 = _mm_add_epi16(T40, T40_0B); T40 = _mm_add_epi16(T40, T40_1B); T40 = _mm_add_epi16(T40, T40_0C); T40 = _mm_add_epi16(T40, T40_1C); T40 = _mm_add_epi16(T40, T40_0D); T40 = _mm_add_epi16(T40, T40_1D); T41_0A = _mm_abs_epi16(T41_0A); T41_1A = _mm_abs_epi16(T41_1A); T41_0B = _mm_abs_epi16(T41_0B); T41_1B = _mm_abs_epi16(T41_1B); T41_0C = _mm_abs_epi16(T41_0C); T41_1C = _mm_abs_epi16(T41_1C); T41_0D = _mm_abs_epi16(T41_0D); T41_1D = _mm_abs_epi16(T41_1D); T41 = _mm_add_epi16(T41_0A, T41_1A); T41 = _mm_add_epi16(T41, T41_0B); T41 = _mm_add_epi16(T41, T41_1B); T41 = _mm_add_epi16(T41, T41_0C); T41 = _mm_add_epi16(T41, T41_1C); T41 = _mm_add_epi16(T41, T41_0D); T41 = _mm_add_epi16(T41, T41_1D); T42_0A = _mm_abs_epi16(T42_0A); T42_1A = _mm_abs_epi16(T42_1A); T42_0B = _mm_abs_epi16(T42_0B); T42_1B = _mm_abs_epi16(T42_1B); T42_0C = _mm_abs_epi16(T42_0C); T42_1C = _mm_abs_epi16(T42_1C); T42_0D = _mm_abs_epi16(T42_0D); T42_1D = _mm_abs_epi16(T42_1D); T42 = _mm_add_epi16(T42_0A, T42_1A); T42 = _mm_add_epi16(T42, T42_0B); T42 = _mm_add_epi16(T42, T42_1B); T42 = _mm_add_epi16(T42, T42_0C); T42 = _mm_add_epi16(T42, T42_1C); T42 = _mm_add_epi16(T42, T42_0D); T42 = _mm_add_epi16(T42, T42_1D); T43_0A = _mm_abs_epi16(T43_0A); T43_1A = _mm_abs_epi16(T43_1A); T43_0B = _mm_abs_epi16(T43_0B); T43_1B = _mm_abs_epi16(T43_1B); T43_0C = _mm_abs_epi16(T43_0C); T43_1C = _mm_abs_epi16(T43_1C); T43_0D = _mm_abs_epi16(T43_0D); T43_1D = _mm_abs_epi16(T43_1D); T43 = _mm_add_epi16(T43_0A, T43_1A); T43 = _mm_add_epi16(T43, T43_0B); T43 = _mm_add_epi16(T43, T43_1B); T43 = _mm_add_epi16(T43, T43_0C); T43 = _mm_add_epi16(T43, T43_1C); T43 = _mm_add_epi16(T43, T43_0D); T43 = _mm_add_epi16(T43, T43_1D); T44_0A = _mm_abs_epi16(T44_0A); T44_1A = _mm_abs_epi16(T44_1A); T44_0B = _mm_abs_epi16(T44_0B); T44_1B = _mm_abs_epi16(T44_1B); T44_0C = _mm_abs_epi16(T44_0C); T44_1C = _mm_abs_epi16(T44_1C); T44_0D = _mm_abs_epi16(T44_0D); T44_1D = _mm_abs_epi16(T44_1D); T44 = _mm_add_epi16(T44_0A, T44_1A); T44 = _mm_add_epi16(T44, T44_0B); T44 = _mm_add_epi16(T44, T44_1B); T44 = _mm_add_epi16(T44, T44_0C); T44 = _mm_add_epi16(T44, T44_1C); T44 = _mm_add_epi16(T44, T44_0D); T44 = _mm_add_epi16(T44, T44_1D); T45_0A = _mm_abs_epi16(T45_0A); T45_1A = _mm_abs_epi16(T45_1A); T45_0B = _mm_abs_epi16(T45_0B); T45_1B = _mm_abs_epi16(T45_1B); T45_0C = _mm_abs_epi16(T45_0C); T45_1C = _mm_abs_epi16(T45_1C); T45_0D = _mm_abs_epi16(T45_0D); T45_1D = 
_mm_abs_epi16(T45_1D); T45 = _mm_add_epi16(T45_0A, T45_1A); T45 = _mm_add_epi16(T45, T45_0B); T45 = _mm_add_epi16(T45, T45_1B); T45 = _mm_add_epi16(T45, T45_0C); T45 = _mm_add_epi16(T45, T45_1C); T45 = _mm_add_epi16(T45, T45_0D); T45 = _mm_add_epi16(T45, T45_1D); T46_0A = _mm_abs_epi16(T46_0A); T46_1A = _mm_abs_epi16(T46_1A); T46_0B = _mm_abs_epi16(T46_0B); T46_1B = _mm_abs_epi16(T46_1B); T46_0C = _mm_abs_epi16(T46_0C); T46_1C = _mm_abs_epi16(T46_1C); T46_0D = _mm_abs_epi16(T46_0D); T46_1D = _mm_abs_epi16(T46_1D); T46 = _mm_add_epi16(T46_0A, T46_1A); T46 = _mm_add_epi16(T46, T46_0B); T46 = _mm_add_epi16(T46, T46_1B); T46 = _mm_add_epi16(T46, T46_0C); T46 = _mm_add_epi16(T46, T46_1C); T46 = _mm_add_epi16(T46, T46_0D); T46 = _mm_add_epi16(T46, T46_1D); T47_0A = _mm_abs_epi16(T47_0A); T47_1A = _mm_abs_epi16(T47_1A); T47_0B = _mm_abs_epi16(T47_0B); T47_1B = _mm_abs_epi16(T47_1B); T47_0C = _mm_abs_epi16(T47_0C); T47_1C = _mm_abs_epi16(T47_1C); T47_0D = _mm_abs_epi16(T47_0D); T47_1D = _mm_abs_epi16(T47_1D); T47 = _mm_add_epi16(T47_0A, T47_1A); T47 = _mm_add_epi16(T47, T47_0B); T47 = _mm_add_epi16(T47, T47_1B); T47 = _mm_add_epi16(T47, T47_0C); T47 = _mm_add_epi16(T47, T47_1C); T47 = _mm_add_epi16(T47, T47_0D); T47 = _mm_add_epi16(T47, T47_1D); T48_0A = _mm_abs_epi16(T48_0A); T48_1A = _mm_abs_epi16(T48_1A); T48_0B = _mm_abs_epi16(T48_0B); T48_1B = _mm_abs_epi16(T48_1B); T48_0C = _mm_abs_epi16(T48_0C); T48_1C = _mm_abs_epi16(T48_1C); T48_0D = _mm_abs_epi16(T48_0D); T48_1D = _mm_abs_epi16(T48_1D); T48 = _mm_add_epi16(T48_0A, T48_1A); T48 = _mm_add_epi16(T48, T48_0B); T48 = _mm_add_epi16(T48, T48_1B); T48 = _mm_add_epi16(T48, T48_0C); T48 = _mm_add_epi16(T48, T48_1C); T48 = _mm_add_epi16(T48, T48_0D); T48 = _mm_add_epi16(T48, T48_1D); T49_0A = _mm_abs_epi16(T49_0A); T49_1A = _mm_abs_epi16(T49_1A); T49_0B = _mm_abs_epi16(T49_0B); T49_1B = _mm_abs_epi16(T49_1B); T49_0C = _mm_abs_epi16(T49_0C); T49_1C = _mm_abs_epi16(T49_1C); T49_0D = _mm_abs_epi16(T49_0D); T49_1D = _mm_abs_epi16(T49_1D); T49 = _mm_add_epi16(T49_0A, T49_1A); T49 = _mm_add_epi16(T49, T49_0B); T49 = _mm_add_epi16(T49, T49_1B); T49 = _mm_add_epi16(T49, T49_0C); T49 = _mm_add_epi16(T49, T49_1C); T49 = _mm_add_epi16(T49, T49_0D); T49 = _mm_add_epi16(T49, T49_1D); T50_0A = _mm_abs_epi16(T50_0A); T50_1A = _mm_abs_epi16(T50_1A); T50_0B = _mm_abs_epi16(T50_0B); T50_1B = _mm_abs_epi16(T50_1B); T50_0C = _mm_abs_epi16(T50_0C); T50_1C = _mm_abs_epi16(T50_1C); T50_0D = _mm_abs_epi16(T50_0D); T50_1D = _mm_abs_epi16(T50_1D); T50 = _mm_add_epi16(T50_0A, T50_1A); T50 = _mm_add_epi16(T50, T50_0B); T50 = _mm_add_epi16(T50, T50_1B); T50 = _mm_add_epi16(T50, T50_0C); T50 = _mm_add_epi16(T50, T50_1C); T50 = _mm_add_epi16(T50, T50_0D); T50 = _mm_add_epi16(T50, T50_1D); T51_0A = _mm_abs_epi16(T51_0A); T51_1A = _mm_abs_epi16(T51_1A); T51_0B = _mm_abs_epi16(T51_0B); T51_1B = _mm_abs_epi16(T51_1B); T51_0C = _mm_abs_epi16(T51_0C); T51_1C = _mm_abs_epi16(T51_1C); T51_0D = _mm_abs_epi16(T51_0D); T51_1D = _mm_abs_epi16(T51_1D); T51 = _mm_add_epi16(T51_0A, T51_1A); T51 = _mm_add_epi16(T51, T51_0B); T51 = _mm_add_epi16(T51, T51_1B); T51 = _mm_add_epi16(T51, T51_0C); T51 = _mm_add_epi16(T51, T51_1C); T51 = _mm_add_epi16(T51, T51_0D); T51 = _mm_add_epi16(T51, T51_1D); T52_0A = _mm_abs_epi16(T52_0A); T52_1A = _mm_abs_epi16(T52_1A); T52_0B = _mm_abs_epi16(T52_0B); T52_1B = _mm_abs_epi16(T52_1B); T52_0C = _mm_abs_epi16(T52_0C); T52_1C = _mm_abs_epi16(T52_1C); T52_0D = _mm_abs_epi16(T52_0D); T52_1D = _mm_abs_epi16(T52_1D); T52 = _mm_add_epi16(T52_0A, T52_1A); T52 = 
_mm_add_epi16(T52, T52_0B); T52 = _mm_add_epi16(T52, T52_1B); T52 = _mm_add_epi16(T52, T52_0C); T52 = _mm_add_epi16(T52, T52_1C); T52 = _mm_add_epi16(T52, T52_0D); T52 = _mm_add_epi16(T52, T52_1D); T53_0A = _mm_abs_epi16(T53_0A); T53_1A = _mm_abs_epi16(T53_1A); T53_0B = _mm_abs_epi16(T53_0B); T53_1B = _mm_abs_epi16(T53_1B); T53_0C = _mm_abs_epi16(T53_0C); T53_1C = _mm_abs_epi16(T53_1C); T53_0D = _mm_abs_epi16(T53_0D); T53_1D = _mm_abs_epi16(T53_1D); T53 = _mm_add_epi16(T53_0A, T53_1A); T53 = _mm_add_epi16(T53, T53_0B); T53 = _mm_add_epi16(T53, T53_1B); T53 = _mm_add_epi16(T53, T53_0C); T53 = _mm_add_epi16(T53, T53_1C); T53 = _mm_add_epi16(T53, T53_0D); T53 = _mm_add_epi16(T53, T53_1D); T54_0A = _mm_abs_epi16(T54_0A); T54_1A = _mm_abs_epi16(T54_1A); T54_0B = _mm_abs_epi16(T54_0B); T54_1B = _mm_abs_epi16(T54_1B); T54_0C = _mm_abs_epi16(T54_0C); T54_1C = _mm_abs_epi16(T54_1C); T54_0D = _mm_abs_epi16(T54_0D); T54_1D = _mm_abs_epi16(T54_1D); T54 = _mm_add_epi16(T54_0A, T54_1A); T54 = _mm_add_epi16(T54, T54_0B); T54 = _mm_add_epi16(T54, T54_1B); T54 = _mm_add_epi16(T54, T54_0C); T54 = _mm_add_epi16(T54, T54_1C); T54 = _mm_add_epi16(T54, T54_0D); T54 = _mm_add_epi16(T54, T54_1D); T55_0A = _mm_abs_epi16(T55_0A); T55_1A = _mm_abs_epi16(T55_1A); T55_0B = _mm_abs_epi16(T55_0B); T55_1B = _mm_abs_epi16(T55_1B); T55_0C = _mm_abs_epi16(T55_0C); T55_1C = _mm_abs_epi16(T55_1C); T55_0D = _mm_abs_epi16(T55_0D); T55_1D = _mm_abs_epi16(T55_1D); T55 = _mm_add_epi16(T55_0A, T55_1A); T55 = _mm_add_epi16(T55, T55_0B); T55 = _mm_add_epi16(T55, T55_1B); T55 = _mm_add_epi16(T55, T55_0C); T55 = _mm_add_epi16(T55, T55_1C); T55 = _mm_add_epi16(T55, T55_0D); T55 = _mm_add_epi16(T55, T55_1D); T56_0A = _mm_abs_epi16(T56_0A); T56_1A = _mm_abs_epi16(T56_1A); T56_0B = _mm_abs_epi16(T56_0B); T56_1B = _mm_abs_epi16(T56_1B); T56_0C = _mm_abs_epi16(T56_0C); T56_1C = _mm_abs_epi16(T56_1C); T56_0D = _mm_abs_epi16(T56_0D); T56_1D = _mm_abs_epi16(T56_1D); T56 = _mm_add_epi16(T56_0A, T56_1A); T56 = _mm_add_epi16(T56, T56_0B); T56 = _mm_add_epi16(T56, T56_1B); T56 = _mm_add_epi16(T56, T56_0C); T56 = _mm_add_epi16(T56, T56_1C); T56 = _mm_add_epi16(T56, T56_0D); T56 = _mm_add_epi16(T56, T56_1D); T57_0A = _mm_abs_epi16(T57_0A); T57_1A = _mm_abs_epi16(T57_1A); T57_0B = _mm_abs_epi16(T57_0B); T57_1B = _mm_abs_epi16(T57_1B); T57_0C = _mm_abs_epi16(T57_0C); T57_1C = _mm_abs_epi16(T57_1C); T57_0D = _mm_abs_epi16(T57_0D); T57_1D = _mm_abs_epi16(T57_1D); T57 = _mm_add_epi16(T57_0A, T57_1A); T57 = _mm_add_epi16(T57, T57_0B); T57 = _mm_add_epi16(T57, T57_1B); T57 = _mm_add_epi16(T57, T57_0C); T57 = _mm_add_epi16(T57, T57_1C); T57 = _mm_add_epi16(T57, T57_0D); T57 = _mm_add_epi16(T57, T57_1D); T58_0A = _mm_abs_epi16(T58_0A); T58_1A = _mm_abs_epi16(T58_1A); T58_0B = _mm_abs_epi16(T58_0B); T58_1B = _mm_abs_epi16(T58_1B); T58_0C = _mm_abs_epi16(T58_0C); T58_1C = _mm_abs_epi16(T58_1C); T58_0D = _mm_abs_epi16(T58_0D); T58_1D = _mm_abs_epi16(T58_1D); T58 = _mm_add_epi16(T58_0A, T58_1A); T58 = _mm_add_epi16(T58, T58_0B); T58 = _mm_add_epi16(T58, T58_1B); T58 = _mm_add_epi16(T58, T58_0C); T58 = _mm_add_epi16(T58, T58_1C); T58 = _mm_add_epi16(T58, T58_0D); T58 = _mm_add_epi16(T58, T58_1D); T59_0A = _mm_abs_epi16(T59_0A); T59_1A = _mm_abs_epi16(T59_1A); T59_0B = _mm_abs_epi16(T59_0B); T59_1B = _mm_abs_epi16(T59_1B); T59_0C = _mm_abs_epi16(T59_0C); T59_1C = _mm_abs_epi16(T59_1C); T59_0D = _mm_abs_epi16(T59_0D); T59_1D = _mm_abs_epi16(T59_1D); T59 = _mm_add_epi16(T59_0A, T59_1A); T59 = _mm_add_epi16(T59, T59_0B); T59 = _mm_add_epi16(T59, T59_1B); T59 = 
_mm_add_epi16(T59, T59_0C); T59 = _mm_add_epi16(T59, T59_1C); T59 = _mm_add_epi16(T59, T59_0D); T59 = _mm_add_epi16(T59, T59_1D); T60_0A = _mm_abs_epi16(T60_0A); T60_1A = _mm_abs_epi16(T60_1A); T60_0B = _mm_abs_epi16(T60_0B); T60_1B = _mm_abs_epi16(T60_1B); T60_0C = _mm_abs_epi16(T60_0C); T60_1C = _mm_abs_epi16(T60_1C); T60_0D = _mm_abs_epi16(T60_0D); T60_1D = _mm_abs_epi16(T60_1D); T60 = _mm_add_epi16(T60_0A, T60_1A); T60 = _mm_add_epi16(T60, T60_0B); T60 = _mm_add_epi16(T60, T60_1B); T60 = _mm_add_epi16(T60, T60_0C); T60 = _mm_add_epi16(T60, T60_1C); T60 = _mm_add_epi16(T60, T60_0D); T60 = _mm_add_epi16(T60, T60_1D); T61_0A = _mm_abs_epi16(T61_0A); T61_1A = _mm_abs_epi16(T61_1A); T61_0B = _mm_abs_epi16(T61_0B); T61_1B = _mm_abs_epi16(T61_1B); T61_0C = _mm_abs_epi16(T61_0C); T61_1C = _mm_abs_epi16(T61_1C); T61_0D = _mm_abs_epi16(T61_0D); T61_1D = _mm_abs_epi16(T61_1D); T61 = _mm_add_epi16(T61_0A, T61_1A); T61 = _mm_add_epi16(T61, T61_0B); T61 = _mm_add_epi16(T61, T61_1B); T61 = _mm_add_epi16(T61, T61_0C); T61 = _mm_add_epi16(T61, T61_1C); T61 = _mm_add_epi16(T61, T61_0D); T61 = _mm_add_epi16(T61, T61_1D); T62_0A = _mm_abs_epi16(T62_0A); T62_1A = _mm_abs_epi16(T62_1A); T62_0B = _mm_abs_epi16(T62_0B); T62_1B = _mm_abs_epi16(T62_1B); T62_0C = _mm_abs_epi16(T62_0C); T62_1C = _mm_abs_epi16(T62_1C); T62_0D = _mm_abs_epi16(T62_0D); T62_1D = _mm_abs_epi16(T62_1D); T62 = _mm_add_epi16(T62_0A, T62_1A); T62 = _mm_add_epi16(T62, T62_0B); T62 = _mm_add_epi16(T62, T62_1B); T62 = _mm_add_epi16(T62, T62_0C); T62 = _mm_add_epi16(T62, T62_1C); T62 = _mm_add_epi16(T62, T62_0D); T62 = _mm_add_epi16(T62, T62_1D); T63_0A = _mm_abs_epi16(T63_0A); T63_1A = _mm_abs_epi16(T63_1A); T63_0B = _mm_abs_epi16(T63_0B); T63_1B = _mm_abs_epi16(T63_1B); T63_0C = _mm_abs_epi16(T63_0C); T63_1C = _mm_abs_epi16(T63_1C); T63_0D = _mm_abs_epi16(T63_0D); T63_1D = _mm_abs_epi16(T63_1D); T63 = _mm_add_epi16(T63_0A, T63_1A); T63 = _mm_add_epi16(T63, T63_0B); T63 = _mm_add_epi16(T63, T63_1B); T63 = _mm_add_epi16(T63, T63_0C); T63 = _mm_add_epi16(T63, T63_1C); T63 = _mm_add_epi16(T63, T63_0D); T63 = _mm_add_epi16(T63, T63_1D); S = _mm_add_epi16(T0, T1); S = _mm_add_epi16(S, T2); S = _mm_add_epi16(S, T3); S = _mm_add_epi16(S, T4); S = _mm_add_epi16(S, T5); S = _mm_add_epi16(S, T6); S = _mm_add_epi16(S, T7); S = _mm_add_epi16(S, T8); S = _mm_add_epi16(S, T9); S = _mm_add_epi16(S, T10); S = _mm_add_epi16(S, T11); S = _mm_add_epi16(S, T12); S = _mm_add_epi16(S, T13); S = _mm_add_epi16(S, T14); S = _mm_add_epi16(S, T15); M1 = _mm_add_epi16(T16, T17); M1 = _mm_add_epi16(M1, T18); M1 = _mm_add_epi16(M1, T19); M1 = _mm_add_epi16(M1, T20); M1 = _mm_add_epi16(M1, T21); M1 = _mm_add_epi16(M1, T22); M1 = _mm_add_epi16(M1, T23); M1 = _mm_add_epi16(M1, T24); M1 = _mm_add_epi16(M1, T25); M1 = _mm_add_epi16(M1, T26); M1 = _mm_add_epi16(M1, T27); M1 = _mm_add_epi16(M1, T28); M1 = _mm_add_epi16(M1, T29); M1 = _mm_add_epi16(M1, T30); M1 = _mm_add_epi16(M1, T31); M2 = _mm_add_epi16(T32, T33); M2 = _mm_add_epi16(M2, T34); M2 = _mm_add_epi16(M2, T35); M2 = _mm_add_epi16(M2, T36); M2 = _mm_add_epi16(M2, T37); M2 = _mm_add_epi16(M2, T38); M2 = _mm_add_epi16(M2, T39); M2 = _mm_add_epi16(M2, T40); M2 = _mm_add_epi16(M2, T41); M2 = _mm_add_epi16(M2, T42); M2 = _mm_add_epi16(M2, T43); M2 = _mm_add_epi16(M2, T44); M2 = _mm_add_epi16(M2, T45); M2 = _mm_add_epi16(M2, T46); M2 = _mm_add_epi16(M2, T47); M = _mm_add_epi16(T48, T49); M = _mm_add_epi16(M, T50); M = _mm_add_epi16(M, T51); M = _mm_add_epi16(M, T52); M = _mm_add_epi16(M, T53); M = _mm_add_epi16(M, T54); M = 
_mm_add_epi16(M, T55); M = _mm_add_epi16(M, T56); M = _mm_add_epi16(M, T57); M = _mm_add_epi16(M, T58); M = _mm_add_epi16(M, T59); M = _mm_add_epi16(M, T60); M = _mm_add_epi16(M, T61); M = _mm_add_epi16(M, T62); M = _mm_add_epi16(M, T63); mads = M128_U16(S, 0) + M128_U16(S, 1) + M128_U16(S, 2) + M128_U16(S, 3) + M128_U16(S, 4) + M128_U16(S, 5) + M128_U16(S, 6) + M128_U16(S, 7); mad1 = M128_U16(M1, 0) + M128_U16(M1, 1) + M128_U16(M1, 2) + M128_U16(M1, 3) + M128_U16(M1, 4) + M128_U16(M1, 5) + M128_U16(M1, 6) + M128_U16(M1, 7); mad2 = M128_U16(M2, 0) + M128_U16(M2, 1) + M128_U16(M2, 2) + M128_U16(M2, 3) + M128_U16(M2, 4) + M128_U16(M2, 5) + M128_U16(M2, 6) + M128_U16(M2, 7); mad = M128_U16(M, 0) + M128_U16(M, 1) + M128_U16(M, 2) + M128_U16(M, 3) + M128_U16(M, 4) + M128_U16(M, 5) + M128_U16(M, 6) + M128_U16(M, 7); mad = mads + mad1 + mad2 + mad; return mad; } xavs2-1.3/source/common/vec/intrinsic_pixel.c000066400000000000000000000102011340660520300213250ustar00rootroot00000000000000/* * intrinsic_pixel.c * * Description of this file: * SSE assembly functions of Pixel-Processing module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../basic_types.h" #include "intrinsic.h" #include #include #include #include void xavs2_pixel_average_sse128(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height) { int i, j; __m128i S1, S2, D; if (width & 15) { __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { for (j = 0; j < width - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(src1 + j)); S2 = _mm_loadu_si128((const __m128i*)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i*)(dst + j), D); } S1 = _mm_loadu_si128((const __m128i*)(src1 + j)); S2 = _mm_loadu_si128((const __m128i*)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_maskmoveu_si128(D, mask, (char*)&dst[j]); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(src1 + j)); S2 = _mm_loadu_si128((const __m128i*)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i*)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void *xavs2_memzero_aligned_c_sse2(void *dst, size_t n) { __m128i *p_dst = (__m128i *)dst; __m128i m0 = _mm_setzero_si128(); int i = (int)(n >> 4); for (; i != 0; i--) { _mm_store_si128(p_dst, m0); p_dst++; } return dst; } /* --------------------------------------------------------------------------- */ void xavs2_mem_repeat_i_c_sse2(void *dst, int val, size_t count) { __m128i *p_dst = (__m128i *)dst; __m128i m0 = _mm_set1_epi32(val); int i = (int)((count + 3) >> 2); for (; i != 0; i--) { _mm_store_si128(p_dst, m0); p_dst++; } } /* --------------------------------------------------------------------------- */ void *xavs2_memcpy_aligned_c_sse2(void *dst, const void *src, size_t n) { __m128i *p_dst = (__m128i *)dst; const __m128i *p_src = (const __m128i *)src; int i = (int)(n >> 4); for (; i != 0; i--) { _mm_store_si128(p_dst, _mm_load_si128(p_src)); p_src++; p_dst++; } return dst; } xavs2-1.3/source/common/vec/intrinsic_pixel_avx.c000066400000000000000000000335271340660520300222230ustar00rootroot00000000000000/* * intrinsic_pixel_avx.c * * Description of this file: * AVX2 assembly functions of Pixel-Processing module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include #include #include #include #include #include #include "../basic_types.h" #include "../avs2_defs.h" #include "intrinsic.h" /* --------------------------------------------------------------------------- */ void *xavs2_memzero_aligned_c_avx(void *dst, size_t n) { __m256i *p_dst = (__m256i *)dst; __m256i m0 = _mm256_setzero_si256(); int i = (int)(n >> 5); for (; i != 0; i--) { _mm256_store_si256(p_dst, m0); p_dst++; } return dst; } /* --------------------------------------------------------------------------- */ void xavs2_mem_repeat_i_c_avx(void *dst, int val, size_t count) { __m256i *p_dst = (__m256i *)dst; __m256i m0 = _mm256_set1_epi32(val); int i = (int)((count + 7) >> 3); for (; i != 0; i--) { _mm256_store_si256(p_dst, m0); p_dst++; } } void padding_rows_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; int pad_lr = pad + 16 - (pad & 0xF); start = XAVS2_MAX(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; // left & right for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi16((int16_t)p[0]); __m256i Val2 = _mm256_set1_epi16((int16_t)p[width - 1]); p1 = p - pad_lr; p2 = p + width; for (j = 0; j < pad_lr; j += 16) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } if (start == 0) { p = src - pad; for (i = 1; i <= pad; i++) { memcpy(p - i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } if (start + rows == height) { p = src + i_src * (height - 1) - pad; for (i = 1; i <= pad; i++) { memcpy(p + i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } } void padding_rows_lr_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; int pad_lr = pad + 16 - (pad & 0xF); start = XAVS2_MAX(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; // left & right for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi16((int16_t)p[0]); __m256i Val2 = _mm256_set1_epi16((int16_t)p[width - 1]); p1 = p - pad_lr; p2 = p + width; for (j = 0; j < pad_lr; j += 16) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } } void add_pel_clip_sse256(const pel_t *src1, int i_src1, const coeff_t *src2, int i_src2, pel_t *dst, int i_dst, int width, int height) { int i, j; __m256i mask; __m128i mask1; if (width >= 32) { __m256i S, R1, R2, S1, S2, D; __m256i zero = _mm256_setzero_si256(); mask = _mm256_load_si256((const __m256i *)intrinsic_mask32[(width & 31)]); for (i = 0; i < height; i++) { S = _mm256_loadu_si256((const __m256i *)(src1)); R1 = _mm256_loadu_si256((const __m256i *)(src2)); R2 = _mm256_loadu_si256((const __m256i *)(src2 + 16)); S = _mm256_permute4x64_epi64(S, 0xd8); S1 = _mm256_unpacklo_epi8(S, zero); S2 = _mm256_unpackhi_epi8(S, zero); S1 = _mm256_add_epi16(R1, S1); S2 = _mm256_add_epi16(R2, S2); D = _mm256_packus_epi16(S1, S2); D = _mm256_permute4x64_epi64(D, 0xd8); _mm256_storeu_si256((__m256i *)(dst), D); if (width > 32) { S = _mm256_loadu_si256((const __m256i *)(src1 + 32)); R1 = _mm256_loadu_si256((const __m256i *)(src2 + 32)); R2 = _mm256_loadu_si256((const __m256i *)(src2 + 48)); S = _mm256_permute4x64_epi64(S, 0xd8); S1 = _mm256_unpacklo_epi8(S, zero); S2 = _mm256_unpackhi_epi8(S, zero); S1 = _mm256_add_epi16(R1, S1); S2 = _mm256_add_epi16(R2, S2); D = _mm256_packus_epi16(S1, S2); D = _mm256_permute4x64_epi64(D, 0xd8); _mm256_maskstore_epi32((int 
*)(dst + 32), mask, D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { __m128i zero = _mm_setzero_si128(); __m128i S, S1, S2, R1, R2, D; if (width & 15) { mask1 = _mm_load_si128((const __m128i *)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { for (j = 0; j < width - 15; j += 16) { S = _mm_load_si128((const __m128i *)(src1 + j)); R1 = _mm_load_si128((const __m128i *)(src2 + j)); R2 = _mm_load_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_store_si128((__m128i *)(dst + j), D); } S = _mm_loadu_si128((const __m128i *)(src1 + j)); R1 = _mm_loadu_si128((const __m128i *)(src2 + j)); R2 = _mm_loadu_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_maskmoveu_si128(D, mask1, (char *)&dst[j]); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { S = _mm_load_si128((const __m128i *)(src1 + j)); R1 = _mm_load_si128((const __m128i *)(src2 + j)); R2 = _mm_load_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_store_si128((__m128i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } } } void xavs2_pixel_average_avx(pel_t *dst, int i_dst, pel_t *src1, int i_src1, pel_t *src2, int i_src2, int width, int height) { int i; if (width >= 32) { __m256i mask, S1, S2, D; mask = _mm256_load_si256((const __m256i *)intrinsic_mask32[(width & 31)]); for (i = 0; i < height; i++) { S1 = _mm256_loadu_si256((const __m256i *)(src1)); S2 = _mm256_loadu_si256((const __m256i *)(src2)); D = _mm256_avg_epu8(S1, S2); _mm256_storeu_si256((__m256i *)(dst), D); if (32 < width) { S1 = _mm256_loadu_si256((const __m256i *)(src1 + 32)); S2 = _mm256_loadu_si256((const __m256i *)(src2 + 32)); D = _mm256_avg_epu8(S1, S2); _mm256_maskstore_epi32((int *)(dst + 32), mask, D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { int j; __m128i S1, S2, D; if (width & 15) { __m128i mask = _mm_load_si128((const __m128i *)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { for (j = 0; j < width - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); S2 = _mm_load_si128((const __m128i *)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i *)(dst + j), D); } S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); S2 = _mm_loadu_si128((const __m128i *)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_maskmoveu_si128(D, mask, (char *)&dst[j]); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); S2 = _mm_load_si128((const __m128i *)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } } } void padding_rows_lr_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; start = XAVS2_MAX(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; pad = pad + 16 - (pad & 0xF); if (pad & 0x1f) { __m256i mask = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); for (i = 0; i < 
rows; i++) { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad - 31; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } _mm256_maskstore_epi32((int *)(p1 + j), mask, Val1); _mm256_maskstore_epi32((int *)(p2 + j), mask, Val2); p += i_src; } } else { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } } void padding_rows_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; start = XAVS2_MAX(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; pad = pad + 16 - (pad & 0xF); if (pad & 0x1f) { __m256i mask = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad - 31; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } _mm256_maskstore_epi32((int *)(p1 + j), mask, Val1); _mm256_maskstore_epi32((int *)(p2 + j), mask, Val2); p += i_src; } } else { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } if (start == 0) { p = src - pad; for (i = 1; i <= pad; i++) { memcpy(p - i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } if (start + rows == height) { p = src + i_src * (height - 1) - pad; for (i = 1; i <= pad; i++) { memcpy(p + i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } } xavs2-1.3/source/common/vec/intrinsic_quant.c000066400000000000000000000146261340660520300213530ustar00rootroot00000000000000/* * intrinsic_quant.c * * Description of this file: * SSE assembly functions of QUANT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
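 *
 * The kernels below follow the scalar forms (per coefficient):
 *     level = sign(coef) * ((abs(coef) * scale + add) >> shift)    // quant
 *     coef  = (level * scale + add) >> shift                       // dequant
 * quant_c_sse128() and add_sign_sse128() also return the number of non-zero
 * values, counted by accumulating _mm_cmpeq_epi16() comparisons against zero.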
*/ #include "../basic_types.h" #include "intrinsic.h" #include #include #include // SSE #include // SSE3 #include // SSSE3 #include int quant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add) { __m128i mScale, mAdd; __m128i data0, data1; __m128i T0, T1; __m128i mZero, mCount; int i; mScale = _mm_set1_epi32(scale); mAdd = _mm_set1_epi32(add); mZero = _mm_setzero_si128(); mCount = _mm_setzero_si128(); for (i = 0; i < i_coef; i += 16) { data0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i))); data1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 4))); T0 = _mm_abs_epi32(data0); T1 = _mm_abs_epi32(data1); T0 = _mm_mullo_epi32(T0, mScale); T1 = _mm_mullo_epi32(T1, mScale); T0 = _mm_add_epi32(T0, mAdd); T1 = _mm_add_epi32(T1, mAdd); T0 = _mm_srai_epi32(T0, shift); T1 = _mm_srai_epi32(T1, shift); T0 = _mm_sign_epi32(T0, data0); T1 = _mm_sign_epi32(T1, data1); T0 = _mm_packs_epi32(T0, T1); _mm_store_si128((__m128i *)(coef + i), T0); mCount = _mm_sub_epi16(mCount, _mm_cmpeq_epi16(T0, mZero)); data0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 8))); data1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 12))); T0 = _mm_abs_epi32(data0); T1 = _mm_abs_epi32(data1); T0 = _mm_mullo_epi32(T0, mScale); T1 = _mm_mullo_epi32(T1, mScale); T0 = _mm_add_epi32(T0, mAdd); T1 = _mm_add_epi32(T1, mAdd); T0 = _mm_srai_epi32(T0, shift); T1 = _mm_srai_epi32(T1, shift); T0 = _mm_sign_epi32(T0, data0); T1 = _mm_sign_epi32(T1, data1); T0 = _mm_packs_epi32(T0, T1); _mm_store_si128((__m128i *)(coef + i + 8), T0); mCount = _mm_sub_epi16(mCount, _mm_cmpeq_epi16(T0, mZero)); } mCount = _mm_packus_epi16(mCount, mCount); mCount = _mm_sad_epu8(mCount, mZero); // get the total number of 0 return i_coef - *(int16_t *)&mCount; } void dequant_c_sse128(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add) { __m128i mScale, mAdd; __m128i data0, data1; int i; mScale = _mm_set1_epi32(scale); mAdd = _mm_set1_epi32(add); for (i = 0; i < i_coef; i += 16) { data0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i))); data1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 4))); data0 = _mm_mullo_epi32(data0, mScale); data1 = _mm_mullo_epi32(data1, mScale); data0 = _mm_add_epi32(data0, mAdd); data1 = _mm_add_epi32(data1, mAdd); data0 = _mm_srai_epi32(data0, shift); data1 = _mm_srai_epi32(data1, shift); _mm_store_si128((__m128i *)(coef + i), _mm_packs_epi32(data0, data1)); data0 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 8))); data1 = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(coef + i + 12))); data0 = _mm_mullo_epi32(data0, mScale); data1 = _mm_mullo_epi32(data1, mScale); data0 = _mm_add_epi32(data0, mAdd); data1 = _mm_add_epi32(data1, mAdd); data0 = _mm_srai_epi32(data0, shift); data1 = _mm_srai_epi32(data1, shift); _mm_store_si128((__m128i *)(coef + i + 8), _mm_packs_epi32(data0, data1)); } } void abs_coeff_sse128(coeff_t *dst, const coeff_t *src, const int i_coef) { int i; for (i = 0; i < i_coef; i += 16) { _mm_store_si128((__m128i *)(dst + i), _mm_abs_epi16(_mm_load_si128((__m128i *)(src + i)))); _mm_store_si128((__m128i *)(dst + i + 8), _mm_abs_epi16(_mm_load_si128((__m128i *)(src + i + 8)))); } } int add_sign_sse128(coeff_t *dst, const coeff_t *abs_val, const int i_coef) { __m128i mDst, mAbs; __m128i mZero, mCount; int i; mZero = _mm_setzero_si128(); mCount = _mm_setzero_si128(); for (i = 0; i < i_coef; i += 16) { mDst = _mm_load_si128((__m128i *)(dst + i)); mAbs = 
_mm_load_si128((__m128i *)(abs_val + i)); mDst = _mm_sign_epi16(mAbs, mDst); _mm_store_si128((__m128i *)(dst + i), mDst); mCount = _mm_sub_epi16(mCount, _mm_cmpeq_epi16(mAbs, mZero)); mDst = _mm_load_si128((__m128i *)(dst + i + 8)); mAbs = _mm_load_si128((__m128i *)(abs_val + i + 8)); mDst = _mm_sign_epi16(mAbs, mDst); _mm_store_si128((__m128i *)(dst + i + 8), mDst); mCount = _mm_sub_epi16(mCount, _mm_cmpeq_epi16(mAbs, mZero)); } mCount = _mm_packus_epi16(mCount, mCount); mCount = _mm_sad_epu8(mCount, mZero); return i_coef - *(int16_t *) &mCount; } xavs2-1.3/source/common/vec/intrinsic_quant_avx2.c000066400000000000000000000335421340660520300223110ustar00rootroot00000000000000/* * intrinsic_quant_avx2.c * * Description of this file: * AVX2 assembly functions of QUANT module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Jiaqi ZHANG * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
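 *
 * The AVX2 kernels below handle 16 coefficients per __m256i register. Blocks of
 * exactly 16 coefficients (4x4) take a single-vector fast path; larger blocks
 * are processed 64 coefficients per loop iteration, with the 32-bit intermediate
 * results re-packed to 16 bits and the lanes re-ordered via
 * _mm256_permute4x64_epi64(..., 0xD8).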
*/ #include #include #include #include #include #include "../basic_types.h" #include "intrinsic.h" int quant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add) { __m256i mScale, mAdd; __m256i data0, data1; __m256i T0, T1; __m256i mZero, mCount, mCmp; int i; mScale = _mm256_set1_epi32(scale); mAdd = _mm256_set1_epi32(add); mZero = _mm256_setzero_si256(); mCount = _mm256_setzero_si256(); if (i_coef == 16) { data1 = _mm256_load_si256((__m256i *) coef); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_abs_epi32(data0); T1 = _mm256_abs_epi32(data1); T0 = _mm256_mullo_epi32(T0, mScale); T1 = _mm256_mullo_epi32(T1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_sign_epi32(T0, data0); T1 = _mm256_sign_epi32(T1, data1); T0 = _mm256_packs_epi32(T0, T1); T0 = _mm256_permute4x64_epi64(T0, 0xD8); mCmp = _mm256_cmpeq_epi16(T0, mZero); // for i from 1 to 8, if coeff0[i] == zero, cmp[i] = -1(0xFFFF) mCount = _mm256_sub_epi16(mCount, mCmp); _mm256_store_si256((__m256i *) coef, T0); } else { for (i = 0; i < i_coef; i += 64) { // 0 ~ 15 data1 = _mm256_load_si256((__m256i *)(coef + i)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_abs_epi32(data0); T1 = _mm256_abs_epi32(data1); T0 = _mm256_mullo_epi32(T0, mScale); T1 = _mm256_mullo_epi32(T1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_sign_epi32(T0, data0); T1 = _mm256_sign_epi32(T1, data1); T0 = _mm256_packs_epi32(T0, T1); T0 = _mm256_permute4x64_epi64(T0, 0xD8); mCmp = _mm256_cmpeq_epi16(T0, mZero); mCount = _mm256_sub_epi16(mCount, mCmp); _mm256_store_si256((__m256i *)(coef + i), T0); // 16 ~ 31 data1 = _mm256_load_si256((__m256i *)(coef + i + 16)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_abs_epi32(data0); T1 = _mm256_abs_epi32(data1); T0 = _mm256_mullo_epi32(T0, mScale); T1 = _mm256_mullo_epi32(T1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_sign_epi32(T0, data0); T1 = _mm256_sign_epi32(T1, data1); T0 = _mm256_packs_epi32(T0, T1); T0 = _mm256_permute4x64_epi64(T0, 0xD8); mCmp = _mm256_cmpeq_epi16(T0, mZero); mCount = _mm256_sub_epi16(mCount, mCmp); _mm256_store_si256((__m256i *)(coef + i + 16), T0); // 32 ~ 47 data1 = _mm256_load_si256((__m256i *)(coef + i + 32)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_abs_epi32(data0); T1 = _mm256_abs_epi32(data1); T0 = _mm256_mullo_epi32(T0, mScale); T1 = _mm256_mullo_epi32(T1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_sign_epi32(T0, data0); T1 = _mm256_sign_epi32(T1, data1); T0 = _mm256_packs_epi32(T0, T1); T0 = _mm256_permute4x64_epi64(T0, 0xD8); mCmp = _mm256_cmpeq_epi16(T0, mZero); mCount = _mm256_sub_epi16(mCount, mCmp); _mm256_store_si256((__m256i *)(coef + i + 32), T0); // 48 ~ 63 data1 = _mm256_load_si256((__m256i *)(coef + i + 48)); data0 = 
_mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_abs_epi32(data0); T1 = _mm256_abs_epi32(data1); T0 = _mm256_mullo_epi32(T0, mScale); T1 = _mm256_mullo_epi32(T1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_sign_epi32(T0, data0); T1 = _mm256_sign_epi32(T1, data1); T0 = _mm256_packs_epi32(T0, T1); T0 = _mm256_permute4x64_epi64(T0, 0xD8); mCmp = _mm256_cmpeq_epi16(T0, mZero); mCount = _mm256_sub_epi16(mCount, mCmp); _mm256_store_si256((__m256i *)(coef + i + 48), T0); } } mCount = _mm256_packus_epi16(mCount, mCount); mCount = _mm256_permute4x64_epi64(mCount, 0xD8); mCount = _mm256_sad_epu8(mCount, mZero); // get the total number of 0 return i_coef - *(int16_t *) &mCount - *(((int16_t *) &mCount) + 4); } void dequant_c_avx2(coeff_t *coef, const int i_coef, const int scale, const int shift) { __m256i mScale, mAdd; __m256i data0, data1; __m256i T0, T1; int i; mScale = _mm256_set1_epi32(scale); mAdd = _mm256_set1_epi32(1 << (shift - 1)); if (i_coef == 16) { data1 = _mm256_load_si256((__m256i *) coef); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_mullo_epi32(data0, mScale); T1 = _mm256_mullo_epi32(data1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(T0, T1), 0xD8); _mm256_store_si256((__m256i *) coef, T0); } else { for (i = 0; i < i_coef; i += 64) { // 0 ~ 15 data1 = _mm256_load_si256((__m256i *)(coef + i)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_mullo_epi32(data0, mScale); T1 = _mm256_mullo_epi32(data1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(T0, T1), 0xD8); _mm256_store_si256((__m256i *)(coef + i), T0); // 16 ~ 31 data1 = _mm256_load_si256((__m256i *)(coef + i + 16)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_mullo_epi32(data0, mScale); T1 = _mm256_mullo_epi32(data1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(T0, T1), 0xD8); _mm256_store_si256((__m256i *)(coef + i + 16), T0); // 32 ~ 47 data1 = _mm256_load_si256((__m256i *)(coef + i + 32)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_mullo_epi32(data0, mScale); T1 = _mm256_mullo_epi32(data1, mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(T0, T1), 0xD8); _mm256_store_si256((__m256i *)(coef + i + 32), T0); // 48 ~ 63 data1 = _mm256_load_si256((__m256i *)(coef + i + 48)); data0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(data1)); data1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(data1, 0x1)); T0 = _mm256_mullo_epi32(data0, mScale); T1 = _mm256_mullo_epi32(data1, 
mScale); T0 = _mm256_add_epi32(T0, mAdd); T1 = _mm256_add_epi32(T1, mAdd); T0 = _mm256_srai_epi32(T0, shift); T1 = _mm256_srai_epi32(T1, shift); T0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(T0, T1), 0xD8); _mm256_store_si256((__m256i *)(coef + i + 48), T0); } } } void abs_coeff_avx2(coeff_t *dst, const coeff_t *src, const int i_coef) { int i; if (i_coef == 16) { _mm256_store_si256((__m256i *) dst, _mm256_abs_epi16(_mm256_load_si256((__m256i *) src))); } else { for (i = 0; i < i_coef; i += 64) { _mm256_store_si256((__m256i *)(dst + i), _mm256_abs_epi16(_mm256_load_si256((__m256i *)(src + i)))); _mm256_store_si256((__m256i *)(dst + i + 16), _mm256_abs_epi16(_mm256_load_si256((__m256i *)(src + i + 16)))); _mm256_store_si256((__m256i *)(dst + i + 32), _mm256_abs_epi16(_mm256_load_si256((__m256i *)(src + i + 32)))); _mm256_store_si256((__m256i *)(dst + i + 48), _mm256_abs_epi16(_mm256_load_si256((__m256i *)(src + i + 48)))); } } } int add_sign_avx2(coeff_t *dst, const coeff_t *abs_val, const int i_coef) { __m256i mDst, mAbs; __m256i mZero, mCount; int i; mZero = _mm256_setzero_si256(); mCount = _mm256_setzero_si256(); if (i_coef == 16) { mDst = _mm256_load_si256((__m256i *) dst); mAbs = _mm256_load_si256((__m256i *) abs_val); mDst = _mm256_sign_epi16(mAbs, mDst); mCount = _mm256_sub_epi16(mCount, _mm256_cmpeq_epi16(mAbs, mZero)); _mm256_store_si256((__m256i *) dst, mDst); } else { for (i = 0; i < i_coef; i += 64) { // 0 ~ 15 mDst = _mm256_load_si256((__m256i *)(dst + i)); mAbs = _mm256_load_si256((__m256i *)(abs_val + i)); mDst = _mm256_sign_epi16(mAbs, mDst); mCount = _mm256_sub_epi16(mCount, _mm256_cmpeq_epi16(mAbs, mZero)); _mm256_store_si256((__m256i *)(dst + i), mDst); // 16 ~ 31 mDst = _mm256_load_si256((__m256i *)(dst + i + 16)); mAbs = _mm256_load_si256((__m256i *)(abs_val + i + 16)); mDst = _mm256_sign_epi16(mAbs, mDst); mCount = _mm256_sub_epi16(mCount, _mm256_cmpeq_epi16(mAbs, mZero)); _mm256_store_si256((__m256i *)(dst + i + 16), mDst); // 32 ~ 47 mDst = _mm256_load_si256((__m256i *)(dst + i + 32)); mAbs = _mm256_load_si256((__m256i *)(abs_val + i + 32)); mDst = _mm256_sign_epi16(mAbs, mDst); mCount = _mm256_sub_epi16(mCount, _mm256_cmpeq_epi16(mAbs, mZero)); _mm256_store_si256((__m256i *)(dst + i + 32), mDst); // 48 ~ 63 mDst = _mm256_load_si256((__m256i *)(dst + i + 48)); mAbs = _mm256_load_si256((__m256i *)(abs_val + i + 48)); mDst = _mm256_sign_epi16(mAbs, mDst); mCount = _mm256_sub_epi16(mCount, _mm256_cmpeq_epi16(mAbs, mZero)); _mm256_store_si256((__m256i *)(dst + i + 48), mDst); } } mCount = _mm256_permute4x64_epi64(_mm256_packus_epi16(mCount, mCount), 0xD8); mCount = _mm256_sad_epu8(mCount, mZero); return i_coef - *(int16_t *) &mCount - *(((int16_t *) &mCount) + 4); } xavs2-1.3/source/common/vec/intrinsic_sao.c000066400000000000000000000715301340660520300210020ustar00rootroot00000000000000/* * intrinsic_sao.c * * Description of this file: * SSE assembly functions of SAO module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
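 *
 * The edge-offset (EO) kernels below classify each pixel by
 *     edgetype = sign(c - a) + sign(c - b)
 * where a and b are the two neighbours along the 0/90/135/45 degree direction,
 * then add the per-class offset in 16-bit precision and pack with unsigned
 * saturation. The band-offset (BO) kernel maps each pixel to one of 32 bands
 * and applies the offsets of the four bands signalled by startBand/deltaBand.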
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../common.h" #include "../avs2_defs.h" #include "../basic_types.h" #include "../filter.h" #include "intrinsic.h" #include #include #include #include /* --------------------------------------------------------------------------- */ void SAO_on_block_sse128(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param) { int start_x, end_x, start_y, end_y; int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; __m128i clipMin = _mm_setzero_si128(); assert(sao_param->typeIdc != SAO_TYPE_OFF); switch (sao_param->typeIdc) { case SAO_TYPE_EO_0: { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask; int end_x_16; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_param->offset[0]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[1]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[2]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[3]); off4 = _mm_set1_epi8((int8_t)sao_param->offset[4]); start_x = lcu_avail[SAO_L] ? 0 : 1; end_x = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_16 = end_x - ((end_x - start_x) & 0x0f); for (y = 0; y < i_block_h; y++) { for (x = start_x; x < end_x; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //rightsign etype = _mm_adds_epi8(t0, t3); //edgetype=leftsign+rightsign t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); if (x != end_x_16){ _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else{ mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_16 - 1])); _mm_maskmoveu_si128(t0, mask, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } } break; case SAO_TYPE_EO_90: { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; int end_x_16 = i_block_w - 15; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_param->offset[0]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[1]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[2]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[3]); off4 = _mm_set1_epi8((int8_t)sao_param->offset[4]); start_y = lcu_avail[SAO_T] ? 0 : 1; end_y = lcu_avail[SAO_D] ? 
i_block_h : (i_block_h - 1); p_dst += start_y * i_dst; p_src += start_y * i_src; for (y = start_y; y < end_y; y++) { for (x = 0; x < i_block_w; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x < end_x_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(i_block_w & 15) - 1])); _mm_maskmoveu_si128(t0, mask, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } } break; case SAO_TYPE_EO_135: { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask_r0, mask_r, mask_rn; int end_x_r0_16, end_x_r_16, end_x_rn_16; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_param->offset[0]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[1]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[2]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[3]); off4 = _mm_set1_epi8((int8_t)sao_param->offset[4]); //first row start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1; end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_r0_16 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x0f); for (x = start_x_r0; x < end_x_r0; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r0_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_16 - 1])); _mm_maskmoveu_si128(t0, mask_r0, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); end_x_r_16 = end_x_r - ((end_x_r - start_x_r) & 0x0f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_16 - 1])); _mm_maskmoveu_si128(t0, mask_r, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_rn = lcu_avail[SAO_DR] ? 
i_block_w : (i_block_w - 1); end_x_rn_16 = end_x_rn - ((end_x_rn - start_x_rn) & 0x0f); for (x = start_x_rn; x < end_x_rn; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_rn_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_16 - 1])); _mm_maskmoveu_si128(t0, mask_rn, (char*)(p_dst + x)); break; } } } break; case SAO_TYPE_EO_45: { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask_r0, mask_r, mask_rn; int end_x_r0_16, end_x_r_16, end_x_rn_16; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_param->offset[0]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[1]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[2]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[3]); off4 = _mm_set1_epi8((int8_t)sao_param->offset[4]); //first row start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_r0 = lcu_avail[SAO_TR] ? 
i_block_w : (i_block_w - 1); end_x_r0_16 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x0f); for (x = start_x_r0; x < end_x_r0; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r0_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_16 - 1])); _mm_maskmoveu_si128(t0, mask_r0, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); end_x_r_16 = end_x_r - ((end_x_r - start_x_r) & 0x0f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_16 - 1])); _mm_maskmoveu_si128(t0, mask_r, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_DL] ? 0 : 1; end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_rn_16 = end_x_rn - ((end_x_rn - start_x_rn) & 0x0f); for (x = start_x_rn; x < end_x_rn; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_rn_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_16 - 1])); _mm_maskmoveu_si128(t0, mask_rn, (char*)(p_dst + x)); break; } } } break; case SAO_TYPE_BO: { __m128i r0, r1, r2, r3, off0, off1, off2, off3; __m128i t0, t1, t2, t3, t4, src0, src1; __m128i mask ; __m128i shift_mask = _mm_set1_epi8(31); int shift_bo = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; int end_x_16 = i_block_w - 15; r0 = _mm_set1_epi8((int8_t)(sao_param->startBand)); r1 = _mm_set1_epi8((int8_t)((sao_param->startBand + 1) & 31)); r2 = _mm_set1_epi8((int8_t)((sao_param->deltaBand + sao_param->startBand) & 31)); r3 = _mm_set1_epi8((int8_t)((sao_param->deltaBand + sao_param->startBand + 1) & 31)); off0 = _mm_set1_epi8((int8_t)sao_param->offset[sao_param->startBand]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand + 1) & 31]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[(sao_param->deltaBand + sao_param->startBand) & 31]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[(sao_param->deltaBand + sao_param->startBand + 1) & 31]); for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x += 16) { src0 = _mm_loadu_si128((__m128i*)&p_src[x]); src1 = _mm_and_si128(_mm_srai_epi16(src0, shift_bo), shift_mask); t0 = _mm_cmpeq_epi8(src1, r0); t1 = _mm_cmpeq_epi8(src1, r1); t2 = _mm_cmpeq_epi8(src1, r2); t3 = _mm_cmpeq_epi8(src1, r3); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t0 = _mm_or_si128(t0, t1); t2 = _mm_or_si128(t2, t3); t0 = _mm_or_si128(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(src0, clipMin); t4 = _mm_unpackhi_epi8(src0, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); src0 = _mm_packus_epi16(t1, t2); //saturated if (x < end_x_16) { _mm_storeu_si128((__m128i*)&p_dst[x], src0); } else { mask = _mm_load_si128((const __m128i*)intrinsic_mask[(i_block_w & 15) - 1]); _mm_maskmoveu_si128(src0, mask, (char*)(p_dst + x)); } } p_dst += i_dst; p_src += i_src; } } break; default: { xavs2_log(NULL, 
XAVS2_LOG_ERROR, "Not a supported SAO types."); assert(0); exit(-1); } } } xavs2-1.3/source/common/vec/intrinsic_sao_avx2.c000066400000000000000000000734551340660520300217520ustar00rootroot00000000000000/* * intrinsic_sao_avx2.c * * Description of this file: * AVX2 assembly functions of SAO module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include #include #include #include #include #include "../common.h" #include "intrinsic.h" #include "../filter.h" /* --------------------------------------------------------------------------- */ void SAO_on_block_sse256(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int i_block_w, int i_block_h, int *lcu_avail, SAOBlkParam *sao_param) { int start_x, end_x, start_y, end_y; int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; // __m128i clipMin = _mm_setzero_si128(); assert(sao_param->typeIdc != SAO_TYPE_OFF); switch (sao_param->typeIdc) { case SAO_TYPE_EO_0: { __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_32; offtmp = _mm_loadu_si128((__m128i*)sao_param->offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_param->offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); start_x = lcu_avail[SAO_L] ? 0 : 1; end_x = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_32 = end_x - ((end_x - start_x) & 0x1f); for (y = 0; y < i_block_h; y++) { for (x = start_x; x < end_x; x += 32) { s0 = _mm256_lddqu_si256((__m256i*)&p_src[x - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //leftsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //rightsign etype = _mm256_adds_epi8(t0, t3); etype = _mm256_adds_epi8(etype, c2);//edgetype=left + right +2 t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } break; case SAO_TYPE_EO_90: { __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_32 = i_block_w - (i_block_w & 0x1f); offtmp = _mm_loadu_si128((__m128i*)sao_param->offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_param->offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); start_y = lcu_avail[SAO_T] ? 0 : 1; end_y = lcu_avail[SAO_D] ? 
i_block_h : (i_block_h - 1); p_dst += start_y * i_dst; p_src += start_y * i_src; for (y = start_y; y < end_y; y++) { for (x = 0; x < i_block_w; x += 32) { s0 = _mm256_lddqu_si256((__m256i*)&p_src[x - i_src]); s1 = _mm256_lddqu_si256((__m256i*)&p_src[x]); s2 = _mm256_lddqu_si256((__m256i*)&p_src[x + i_src]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //leftsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //rightsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (i_block_w - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (i_block_w - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[i_block_w - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[i_block_w - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } break; case SAO_TYPE_EO_135: { __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_r0_32, end_x_r_32, end_x_rn_32; offtmp = _mm_loadu_si128((__m128i*)sao_param->offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_param->offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); //first row start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1; end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_r0_32 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x1f); for (x = start_x_r0; x < end_x_r0; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r0_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r0 - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r0 - x > 16) { mask = _mm_loadu_si128((__m128i*)intrinsic_mask[end_x_r0 - end_x_r0_32 - 17]); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_r_32 = end_x_r - ((end_x_r - start_x_r) & 0x1f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_rn = lcu_avail[SAO_DR] ? 
i_block_w : (i_block_w - 1); end_x_rn_32 = end_x_rn - ((end_x_rn - start_x_rn) & 0x1f); for (x = start_x_rn; x < end_x_rn; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_rn_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_rn - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_rn - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } } break; case SAO_TYPE_EO_45: { __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_r0_32, end_x_r_32, end_x_rn_32; offtmp = _mm_loadu_si128((__m128i*)sao_param->offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_param->offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_r0 = lcu_avail[SAO_TR] ? 
i_block_w : (i_block_w - 1); end_x_r0_32 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x1f); //first row for (x = start_x_r0; x < end_x_r0; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r0_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r0 - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r0 - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_r_32 = end_x_r - ((end_x_r - start_x_r) & 0x1f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_DL] ? 0 : 1; end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_rn_32 = end_x_rn - ((end_x_rn - start_x_rn) & 0x1f); for (x = start_x_rn; x < end_x_rn; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_rn_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_rn - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_rn - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } } break; case SAO_TYPE_BO: { __m256i r0, r1, r2, r3, off0, off1, off2, off3; __m256i t0, t1, t2, t3, t4, src0, src1; __m128i mask = _mm_setzero_si128(); __m256i shift_mask = _mm256_set1_epi8(31); int end_x = i_block_w; int end_x_32 = end_x - ((end_x - 0) & 0x1f); r0 = _mm256_set1_epi8((int8_t)(sao_param->startBand)); r1 = _mm256_set1_epi8((int8_t)((sao_param->startBand + 1) & 31)); r2 = _mm256_set1_epi8((int8_t)((sao_param->deltaBand + sao_param->startBand) & 31)); r3 = _mm256_set1_epi8((int8_t)((sao_param->deltaBand + sao_param->startBand + 1) & 31)); off0 = _mm256_set1_epi8((int8_t)sao_param->offset[sao_param->startBand]); off1 = _mm256_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand + 1) & 31]); off2 = _mm256_set1_epi8((int8_t)sao_param->offset[(sao_param->deltaBand + sao_param->startBand) & 31]); off3 = _mm256_set1_epi8((int8_t)sao_param->offset[(sao_param->deltaBand + sao_param->startBand + 1) & 31]); for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x += 32){ src0 = _mm256_loadu_si256((__m256i*)&p_src[x]); src1 = _mm256_srli_epi16(src0, 3); src1 = _mm256_and_si256(src1, shift_mask); t0 = _mm256_cmpeq_epi8(src1, r0); t1 = _mm256_cmpeq_epi8(src1, r1); t2 = _mm256_cmpeq_epi8(src1, r2); t3 = _mm256_cmpeq_epi8(src1, r3); t0 = _mm256_and_si256(t0, off0); t1 = _mm256_and_si256(t1, off1); t2 = _mm256_and_si256(t2, off2); t3 = _mm256_and_si256(t3, off3); t0 = _mm256_or_si256(t0, t1); t2 = _mm256_or_si256(t2, t3); t0 = _mm256_or_si256(t0, t2); //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(src0)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(src0, 1)); t1 = _mm256_add_epi16(t1, t3); t2 = _mm256_add_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = 
_mm256_permute4x64_epi64(t0, 0xd8); if (x < end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } break; default: { xavs2_log(NULL, XAVS2_LOG_ERROR, "Invalid SAO type in %s.", __FUNCTION__); exit(-1); } } } xavs2-1.3/source/common/win32thread.c000066400000000000000000000272131340660520300175120ustar00rootroot00000000000000/* * win32thread.c * * Description of this file: * windows threading of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ /* Microsoft's way of supporting systems with >64 logical cpus can be found at * http://www.microsoft.com/whdc/system/Sysinternals/MoreThan64proc.mspx */ /* Based on the agreed standing that xavs2 encoder does not need to utilize >64 logical cpus, * this API does not detect nor utilize more than 64 cpus for systems that have them. 
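 *
 * (Illustrative sketch only: the helper at the bottom of this file counts the
 *  set bits of the 64-bit process affinity mask, which by construction can
 *  never report more than 64 cpus:
 *
 *      DWORD_PTR bit;
 *      int cpus = 0;
 *      for (bit = 1; bit; bit <<= 1)
 *          cpus += !!(process_cpus & bit);
 *      return cpus ? cpus : 1;
 *  )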
*/ #include "common.h" #if HAVE_WIN32THREAD #include /** * =========================================================================== * type defines * =========================================================================== */ /* number of times to spin a thread about to block on a locked mutex before retrying and sleeping if still locked */ #define XAVS2_SPIN_COUNT 0 /* GROUP_AFFINITY struct */ typedef struct { ULONG_PTR mask; // KAFFINITY = ULONG_PTR USHORT group; USHORT reserved[3]; } xavs2_group_affinity_t; typedef void (WINAPI *cond_func_t)(xavs2_thread_cond_t *cond); typedef BOOL (WINAPI *cond_wait_t)(xavs2_thread_cond_t *cond, xavs2_thread_mutex_t *mutex, DWORD milliseconds); typedef struct { /* global mutex for replacing MUTEX_INITIALIZER instances */ xavs2_thread_mutex_t static_mutex; /* function pointers to conditional variable API on windows 6.0+ kernels */ cond_func_t cond_broadcast; cond_func_t cond_init; cond_func_t cond_signal; cond_wait_t cond_wait; } xavs2_win32thread_control_t; static xavs2_win32thread_control_t thread_control; /** * =========================================================================== * function defines * =========================================================================== */ /* _beginthreadex requires that the start routine is __stdcall */ static unsigned __stdcall xavs2_win32thread_worker(void *arg) { xavs2_thread_t *h = arg; h->ret = h->func(h->arg); return 0; } int xavs2_thread_create(xavs2_thread_t *thread, const xavs2_thread_attr_t *attr, void *(*start_routine)(void *), void *arg) { UNUSED_PARAMETER(attr); thread->func = start_routine; thread->arg = arg; thread->handle = (void *)_beginthreadex(NULL, 0, xavs2_win32thread_worker, thread, 0, NULL); return !thread->handle; } int xavs2_thread_join(xavs2_thread_t thread, void **value_ptr) { DWORD ret = WaitForSingleObject(thread.handle, INFINITE); if (ret != WAIT_OBJECT_0) { return -1; } if (value_ptr) { *value_ptr = thread.ret; } CloseHandle(thread.handle); return 0; } int xavs2_thread_mutex_init(xavs2_thread_mutex_t *mutex, const xavs2_thread_mutexattr_t *attr) { UNUSED_PARAMETER(attr); return !InitializeCriticalSectionAndSpinCount(mutex, XAVS2_SPIN_COUNT); } int xavs2_thread_mutex_destroy(xavs2_thread_mutex_t *mutex) { DeleteCriticalSection(mutex); return 0; } int xavs2_thread_mutex_lock(xavs2_thread_mutex_t *mutex) { static xavs2_thread_mutex_t init = XAVS2_PTHREAD_MUTEX_INITIALIZER; if (!memcmp(mutex, &init, sizeof(xavs2_thread_mutex_t))) { *mutex = thread_control.static_mutex; } EnterCriticalSection(mutex); return 0; } int xavs2_thread_mutex_unlock(xavs2_thread_mutex_t *mutex) { LeaveCriticalSection(mutex); return 0; } /* for pre-Windows 6.0 platforms we need to define and use our own condition variable and api */ typedef struct { xavs2_thread_mutex_t mtx_broadcast; xavs2_thread_mutex_t mtx_waiter_count; int waiter_count; HANDLE semaphore; HANDLE waiters_done; int is_broadcast; } xavs2_win32_cond_t; int xavs2_thread_cond_init(xavs2_thread_cond_t *cond, const xavs2_thread_condattr_t *attr) { xavs2_win32_cond_t *win32_cond; UNUSED_PARAMETER(attr); if (thread_control.cond_init) { thread_control.cond_init(cond); return 0; } /* non native condition variables */ win32_cond = xavs2_calloc(1, sizeof(xavs2_win32_cond_t)); if (!win32_cond) { return -1; } cond->ptr = win32_cond; win32_cond->semaphore = CreateSemaphore(NULL, 0, 0x7fffffff, NULL); if (!win32_cond->semaphore) { return -1; } if (xavs2_thread_mutex_init(&win32_cond->mtx_waiter_count, NULL)) { return -1; } if 
(xavs2_thread_mutex_init(&win32_cond->mtx_broadcast, NULL)) { return -1; } win32_cond->waiters_done = CreateEvent(NULL, FALSE, FALSE, NULL); if (!win32_cond->waiters_done) { return -1; } return 0; } int xavs2_thread_cond_destroy(xavs2_thread_cond_t *cond) { xavs2_win32_cond_t *win32_cond; /* native condition variables do not destroy */ if (thread_control.cond_init) { return 0; } /* non native condition variables */ win32_cond = cond->ptr; CloseHandle(win32_cond->semaphore); CloseHandle(win32_cond->waiters_done); xavs2_thread_mutex_destroy(&win32_cond->mtx_broadcast); xavs2_thread_mutex_destroy(&win32_cond->mtx_waiter_count); xavs2_free(win32_cond); return 0; } int xavs2_thread_cond_broadcast(xavs2_thread_cond_t *cond) { xavs2_win32_cond_t *win32_cond; int have_waiter = 0; if (thread_control.cond_broadcast) { thread_control.cond_broadcast(cond); return 0; } /* non native condition variables */ win32_cond = cond->ptr; xavs2_thread_mutex_lock(&win32_cond->mtx_broadcast); xavs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); if (win32_cond->waiter_count) { win32_cond->is_broadcast = 1; have_waiter = 1; } if (have_waiter) { ReleaseSemaphore(win32_cond->semaphore, win32_cond->waiter_count, NULL); xavs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); WaitForSingleObject(win32_cond->waiters_done, INFINITE); win32_cond->is_broadcast = 0; } else { xavs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); } return xavs2_thread_mutex_unlock(&win32_cond->mtx_broadcast); } int xavs2_thread_cond_signal(xavs2_thread_cond_t *cond) { xavs2_win32_cond_t *win32_cond; int have_waiter; if (thread_control.cond_signal) { thread_control.cond_signal(cond); return 0; } /* non-native condition variables */ win32_cond = cond->ptr; xavs2_thread_mutex_lock(&win32_cond->mtx_broadcast); xavs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); have_waiter = win32_cond->waiter_count; xavs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); if (have_waiter) { ReleaseSemaphore(win32_cond->semaphore, 1, NULL); } return xavs2_thread_mutex_unlock(&win32_cond->mtx_broadcast); } int xavs2_thread_cond_wait(xavs2_thread_cond_t *cond, xavs2_thread_mutex_t *mutex) { xavs2_win32_cond_t *win32_cond; int last_waiter; if (thread_control.cond_wait) { return !thread_control.cond_wait(cond, mutex, INFINITE); } /* non native condition variables */ win32_cond = cond->ptr; xavs2_thread_mutex_lock(&win32_cond->mtx_broadcast); xavs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); win32_cond->waiter_count++; xavs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); xavs2_thread_mutex_unlock(&win32_cond->mtx_broadcast); // unlock the external mutex xavs2_thread_mutex_unlock(mutex); WaitForSingleObject(win32_cond->semaphore, INFINITE); xavs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); win32_cond->waiter_count--; last_waiter = !win32_cond->waiter_count || !win32_cond->is_broadcast; xavs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); if (last_waiter) { SetEvent(win32_cond->waiters_done); } // lock the external mutex return xavs2_thread_mutex_lock(mutex); } int xavs2_win32_threading_init(void) { /* find function pointers to API functions, if they exist */ HMODULE kernel_dll = GetModuleHandle(TEXT("kernel32")); thread_control.cond_init = (cond_func_t)GetProcAddress(kernel_dll, "InitializeConditionVariable"); if (thread_control.cond_init) { /* we're on a windows 6.0+ kernel, acquire the rest of the functions */ thread_control.cond_broadcast = (cond_func_t)GetProcAddress(kernel_dll, "WakeAllConditionVariable"); 
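/* note: resolving these exports via GetProcAddress (rather than linking them
 * directly) keeps the library loadable on pre-Vista kernels, which lack native
 * condition variables and instead use the emulated implementation above */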
thread_control.cond_signal = (cond_func_t)GetProcAddress(kernel_dll, "WakeConditionVariable"); thread_control.cond_wait = (cond_wait_t)GetProcAddress(kernel_dll, "SleepConditionVariableCS"); } return xavs2_thread_mutex_init(&thread_control.static_mutex, NULL); } void xavs2_win32_threading_destroy(void) { xavs2_thread_mutex_destroy(&thread_control.static_mutex); memset(&thread_control, 0, sizeof(xavs2_win32thread_control_t)); } int xavs2_thread_num_processors_np() { DWORD_PTR system_cpus, process_cpus = 0; int cpus = 0; DWORD_PTR bit; /* GetProcessAffinityMask returns affinities of 0 when the process has threads in multiple processor groups. * On platforms that support processor grouping, use GetThreadGroupAffinity to get the current thread's affinity instead. */ #if ARCH_X86_64 /* find function pointers to API functions specific to x86_64 platforms, if they exist. * BOOL GetThreadGroupAffinity(_In_ HANDLE hThread, _Out_ PGROUP_AFFINITY GroupAffinity); */ typedef BOOL(*get_thread_affinity_t)(HANDLE thread, xavs2_group_affinity_t *group_affinity); HANDLE kernel_dll = GetModuleHandle(TEXT("kernel32.dll")); get_thread_affinity_t get_thread_affinity = (get_thread_affinity_t)GetProcAddress(kernel_dll, "GetThreadGroupAffinity"); if (get_thread_affinity) { /* running on a platform that supports >64 logical cpus */ xavs2_group_affinity_t thread_affinity; if (get_thread_affinity(GetCurrentThread(), &thread_affinity)) { process_cpus = thread_affinity.mask; } } #endif if (!process_cpus) { GetProcessAffinityMask(GetCurrentProcess(), &process_cpus, &system_cpus); } for (bit = 1; bit; bit <<= 1) { cpus += !!(process_cpus & bit); } return cpus ? cpus : 1; } #endif // #if HAVE_WIN32THREAD xavs2-1.3/source/common/win32thread.h000066400000000000000000000102061340660520300175110ustar00rootroot00000000000000/* * win32thread.h * * Description of this file: * windows threading of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_WIN32THREAD_H #define XAVS2_WIN32THREAD_H #define WIN32_LEAN_AND_MEAN #include /* the following macro is used within xavs2 encoder */ #undef ERROR typedef struct { void *handle; void *(*func)(void *arg); void *arg; void *ret; } xavs2_thread_t; #define xavs2_thread_attr_t int /* the conditional variable api for windows 6.0+ uses critical sections and not mutexes */ typedef CRITICAL_SECTION xavs2_thread_mutex_t; #define XAVS2_PTHREAD_MUTEX_INITIALIZER {0} #define xavs2_thread_mutexattr_t int #define pthread_exit(a) /* This is the CONDITIONAL_VARIABLE typedef for using Window's native conditional variables on kernels 6.0+. * MinGW does not currently have this typedef. */ typedef struct { void *ptr; } xavs2_thread_cond_t; #define xavs2_thread_condattr_t int #define xavs2_thread_create FPFX(thread_create) int xavs2_thread_create(xavs2_thread_t *thread, const xavs2_thread_attr_t *attr, void *(*start_routine)(void *), void *arg); #define xavs2_thread_join FPFX(thread_join) int xavs2_thread_join(xavs2_thread_t thread, void **value_ptr); #define xavs2_thread_mutex_init FPFX(thread_mutex_init) int xavs2_thread_mutex_init(xavs2_thread_mutex_t *mutex, const xavs2_thread_mutexattr_t *attr); #define xavs2_thread_mutex_destroy FPFX(thread_mutex_destroy) int xavs2_thread_mutex_destroy(xavs2_thread_mutex_t *mutex); #define xavs2_thread_mutex_lock FPFX(thread_mutex_lock) int xavs2_thread_mutex_lock(xavs2_thread_mutex_t *mutex); #define xavs2_thread_mutex_unlock FPFX(thread_mutex_unlock) int xavs2_thread_mutex_unlock(xavs2_thread_mutex_t *mutex); #define xavs2_thread_cond_init FPFX(thread_cond_init) int xavs2_thread_cond_init(xavs2_thread_cond_t *cond, const xavs2_thread_condattr_t *attr); #define xavs2_thread_cond_destroy FPFX(thread_cond_destroy) int xavs2_thread_cond_destroy(xavs2_thread_cond_t *cond); #define xavs2_thread_cond_broadcast FPFX(thread_cond_broadcast) int xavs2_thread_cond_broadcast(xavs2_thread_cond_t *cond); #define xavs2_thread_cond_wait FPFX(thread_cond_wait) int xavs2_thread_cond_wait(xavs2_thread_cond_t *cond, xavs2_thread_mutex_t *mutex); #define xavs2_thread_cond_signal FPFX(thread_cond_signal) int xavs2_thread_cond_signal(xavs2_thread_cond_t *cond); #define xavs2_thread_attr_init(a) 0 #define xavs2_thread_attr_destroy(a) 0 #define xavs2_win32_threading_init FPFX(win32_threading_init) int xavs2_win32_threading_init(void); #define xavs2_win32_threading_destroy FPFX(win32_threading_destroy) void xavs2_win32_threading_destroy(void); #define xavs2_thread_num_processors_np FPFX(thread_num_processors_np) int xavs2_thread_num_processors_np(void); #endif // XAVS2_WIN32THREAD_H xavs2-1.3/source/common/x86/000077500000000000000000000000001340660520300156345ustar00rootroot00000000000000xavs2-1.3/source/common/x86/blockcopy8.asm000066400000000000000000005005151340660520300204210ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Praveen Kumar Tiwari ;* Murugan Vairavel ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 cextern pb_4 cextern pb_1 cextern pb_16 cextern pb_64 cextern pw_4 cextern pb_8 cextern pb_32 cextern pb_128 SECTION .text ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x4, 4, 7, 0 mov r4w, [r2] mov r5w, [r2 + r3] mov r6w, [r2 + 2 * r3] lea r3, [r3 + 2 * r3] mov r3w, [r2 + r3] mov [r0], r4w mov [r0 + r1], r5w mov [r0 + 2 * r1], r6w lea r1, [r1 + 2 * r1] mov [r0 + r1], r3w RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x8, 4, 7, 0 lea r5, [3 * r1] lea r6, [3 * r3] mov r4w, [r2] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w lea r2, [r2 + 4 * r3] mov r4w, [r2] lea r0, [r0 + 4 * r1] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x16, 4, 7, 0 lea r5, [3 * r1] lea r6, [3 * r3] mov r4w, [r2] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w %rep 3 lea r2, [r2 + 4 * r3] mov r4w, [r2] lea r0, [r0 + 4 * r1] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x2, 4, 6, 0 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x4, 4, 4, 4 movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] lea r3, [r3 + r3 * 2] movd m3, [r2 + r3] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 lea r1, [r1 + 2 * r1] movd [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, 
const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x8, 4, 6, 4 lea r4, [3 * r1] lea r5, [3 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r5] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r4], m3 lea r2, [r2 + 4 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r5] lea r0, [r0 + 4 * r1] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r4], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 7, 4 mov r4d, %2/8 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r6] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r5], m3 lea r2, [r2 + 4 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r6] lea r0, [r0 + 4 * r1] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r5], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PP_W4_H8 4, 16 BLOCKCOPY_PP_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x8, 4, 7, 3 movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] movd m2, [r2 + 2 * r3] mov r6w, [r2 + 2 * r3 + 4] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w movd [r0 + 2 * r1], m2 mov [r0 + 2 * r1 + 4], r6w lea r2, [r2 + 2 * r3] movd m0, [r2 + r3] mov r4w, [r2 + r3 + 4] movd m1, [r2 + 2 * r3] mov r5w, [r2 + 2 * r3 + 4] lea r2, [r2 + 2 * r3] movd m2, [r2 + r3] mov r6w, [r2 + r3 + 4] lea r0, [r0 + 2 * r1] movd [r0 + r1], m0 mov [r0 + r1 + 4], r4w movd [r0 + 2 * r1], m1 mov [r0 + 2 * r1 + 4], r5w lea r0, [r0 + 2 * r1] movd [r0 + r1], m2 mov [r0 + r1 + 4], r6w lea r2, [r2 + 2 * r3] movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] lea r0, [r0 + 2 * r1] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x16, 4, 7, 2 mov r6d, 16/2 .loop: movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] lea r2, [r2 + r3 * 2] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x2, 4, 4, 2 movh m0, [r2] movh m1, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const 
pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x4, 4, 4, 4 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r3, [r3 + r3 * 2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 lea r1, [r1 + 2 * r1] movh [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x6, 4, 4, 6 movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + 2 * r3] movh m2, [r2] movh m3, [r2 + r3] lea r2, [r2 + 2 * r3] movh m4, [r2] movh m5, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] movh [r0], m4 movh [r0 + r1], m5 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x12, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 2 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x8, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x16, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 3 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x32, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 7 lea r2, [r2 + 4 * r3] movh m0, 
[r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x64, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 15 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movh m0, [r2] movd m1, [r2 + 8] movh m2, [r2 + r3] movd m3, [r2 + r3 + 8] lea r2, [r2 + 2 * r3] movh [r0], m0 movd [r0 + 8], m1 movh [r0 + r1], m2 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] movh m0, [r2] movd m1, [r2 + 8] movh m2, [r2 + r3] movd m3, [r2 + r3 + 8] movh [r0], m0 movd [r0 + 8], m1 movh [r0 + r1], m2 movd [r0 + r1 + 8], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W12_H4 12, 16 BLOCKCOPY_PP_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W16_H4 16, 4 BLOCKCOPY_PP_W16_H4 16, 12 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H8 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/8 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + r3] lea r2, [r2 + 2 * r3] movu m4, [r2] movu m5, [r2 + r3] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r0, [r0 + 2 * r1] movu [r0], m4 movu [r0 + r1], m5 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W16_H8 16, 8 BLOCKCOPY_PP_W16_H8 16, 16 BLOCKCOPY_PP_W16_H8 16, 32 BLOCKCOPY_PP_W16_H8 16, 64 BLOCKCOPY_PP_W16_H8 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) 
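; (rough C equivalent of the copy performed by the macro below, for the 8-bit
;  pixel build -- illustrative only, not part of this file:
;      for (y = 0; y < %2; y++)
;          memcpy(dst + y * dstStride, src + y * srcStride, %1);
;  )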
;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W24_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu m4, [r2] movh m5, [r2 + 16] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu [r0], m4 movh [r0 + 16], m5 movu m0, [r2 + r3] movh m1, [r2 + r3 + 16] movu [r0 + r1], m0 movh [r0 + r1 + 16], m1 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W24_H4 24, 32 BLOCKCOPY_PP_W24_H4 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W32_H4 32, 8 BLOCKCOPY_PP_W32_H4 32, 16 BLOCKCOPY_PP_W32_H4 32, 24 BLOCKCOPY_PP_W32_H4 32, 32 BLOCKCOPY_PP_W32_H4 32, 64 BLOCKCOPY_PP_W32_H4 32, 48 INIT_YMM avx cglobal blockcopy_pp_32x8, 4, 6, 6 lea r4, [3 * r1] lea r5, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 RET INIT_YMM avx cglobal blockcopy_pp_32x16, 4, 6, 6 lea r4, [3 * r1] lea r5, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] lea r2, [r2 + 4 * r3] movu m2, [r2] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] movu m5, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r0, [r0 + 4 * r1] movu [r0], m2 movu [r0 + r1], m3 movu [r0 + 2 * r1], m4 movu [r0 + r4], m5 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_YMM avx cglobal blockcopy_pp_32x24, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, 24/8 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r2, 
[r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H16_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, %2/16 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] lea r2, [r2 + 4 * r3] movu m2, [r2] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] movu m5, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r0, [r0 + 4 * r1] movu [r0], m2 movu [r0 + r1], m3 movu [r0 + 2 * r1], m4 movu [r0 + r4], m5 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET %endmacro BLOCKCOPY_PP_W32_H16_avx 32, 32 BLOCKCOPY_PP_W32_H16_avx 32, 48 BLOCKCOPY_PP_W32_H16_avx 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W48_H4_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu xm1, [r2 + 32] movu m2, [r2 + r3] movu xm3, [r2 + r3 + 32] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 32], xm1 movu [r0 + r1], m2 movu [r0 + r1 + 32], xm3 lea r0, [r0 + 2 * r1] movu m0, [r2] movu xm1, [r2 + 32] movu m2, [r2 + r3] movu xm3, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 32], xm1 movu [r0 + r1], m2 movu [r0 + r1 + 32], xm3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W48_H4_avx 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 
32] movu m3, [r2 + 48] movu m4, [r2 + r3] movu m5, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + r1], m4 movu [r0 + r1 + 16], m5 movu m0, [r2 + r3 + 32] movu m1, [r2 + r3 + 48] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + 16] movu m4, [r2 + 32] movu m5, [r2 + 48] movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + 16], m3 movu [r0 + 32], m4 movu [r0 + 48], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W64_H4 64, 16 BLOCKCOPY_PP_W64_H4 64, 32 BLOCKCOPY_PP_W64_H4 64, 48 BLOCKCOPY_PP_W64_H4 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + r3] movu m3, [r2 + r3 + 32] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 32] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + r1], m2 movu [r0 + r1 + 32], m3 movu [r0 + 2 * r1], m4 movu [r0 + 2 * r1 + 32], m5 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu [r0 + r4], m0 movu [r0 + r4 + 32], m1 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET %endmacro BLOCKCOPY_PP_W64_H4_avx 64, 16 BLOCKCOPY_PP_W64_H4_avx 64, 32 BLOCKCOPY_PP_W64_H4_avx 64, 48 BLOCKCOPY_PP_W64_H4_avx 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x4, 4, 5, 2 add r3, r3 ;Row 0-1 movd m0, [r2] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0], r4w pextrw [r0 + r1], m0, 4 ;Row 2-3 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x8, 4, 5, 2 add r3, r3 ;Row 0-1 movd m0, [r2] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0], r4w pextrw [r0 + r1], m0, 4 ;Row 2-3 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 ;Row 4-5 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 ;Row 6-7 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- 
%macro BLOCKCOPY_SP_W2_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: movd m0, [r2] movd m1, [r2 + r3] dec r6d lea r2, [r2 + r3 * 2] packuswb m0, m0 packuswb m1, m1 movd r4d, m0 movd r5d, m1 mov [r0], r4w mov [r0 + r1], r5w lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W2_H2 2, 4 BLOCKCOPY_SP_W2_H2 2, 8 BLOCKCOPY_SP_W2_H2 2, 16 ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] packuswb m0, m1 packuswb m2, m3 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] movh m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m5, [r2 + r3] movh m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 movd [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] pshufd m4, m4, 2 movd [r0 + r1], m4 movd [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] pshufd m6, m6, 2 movd [r0 + r1], m6 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 .loop: movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] movh m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m5, [r2 + r3] movh m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 movd [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] pshufd m4, m4, 2 movd [r0 + r1], m4 movd [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] pshufd m6, m6, 2 movd [r0 + r1], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W4_H8 4, 16 BLOCKCOPY_SP_W4_H8 4, 32 
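; A rough scalar reference for the blockcopy_sp_* kernels in this file
; (illustrative sketch only; the C-level name blockcopy_sp_c is an assumption,
;  not part of this file).  For the 8-bit pixel build, packuswb performs
;  exactly this unsigned-saturating narrowing of the int16_t source back to
;  the pixel range:
;
;      void blockcopy_sp_c(pixel *dst, intptr_t dstStride,
;                          const int16_t *src, intptr_t srcStride,
;                          int bx, int by)
;      {
;          int x, y, v;
;          for (y = 0; y < by; y++) {
;              for (x = 0; x < bx; x++) {
;                  v = src[x];
;                  dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
;              }
;              dst += dstStride;
;              src += srcStride;
;          }
;      }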
;----------------------------------------------------------------------------- ; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_6x8, 4, 4, 2 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W6_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: movh m0, [r2] movd m2, [r2 + 8] movh m1, [r2 + r3] movd m3, [r2 + r3 + 8] dec r6d lea r2, [r2 + r3 * 2] packuswb m0, m0 packuswb m2, m2 packuswb m1, m1 packuswb m3, m3 movd r4d, m2 movd r5d, m3 movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W6_H2 6, 8 BLOCKCOPY_SP_W6_H2 6, 16 ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movlps [r0], m0 movhps [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] packuswb m0, m1 packuswb m2, m3 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 RET 
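; (Note on the 8-wide blockcopy_sp_* variants (8x2 through 8x64): each pair of
;  rows is loaded as two xmm registers of eight int16_t values, packed into a
;  single register with packuswb, and written back with movlps/movhps, so every
;  packuswb produces two 8-byte output rows.)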
;----------------------------------------------------------------------------- ; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] movu m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 movlps [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m6 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride add r3, r3 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] dec r4d lea r2, [r2 + r3 * 2] packuswb m0, m1 packuswb m2, m3 movlps [r0], m0 movhps [r0 + r1], m0 lea r0, [r0 + r1 * 2] movlps [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W8_H4 8, 12 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H8 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] movu m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 movlps [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W8_H8 8, 16 BLOCKCOPY_SP_W8_H8 8, 32 BLOCKCOPY_SP_W8_H8 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 16] lea r2, [r2 + 2 * r3] movu m6, [r2 + r3] movu m7, [r2 + r3 + 16] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movh [r0], m0 pshufd m0, m0, 2 movd [r0 + 8], m0 movh [r0 + r1], m2 pshufd m2, m2, 2 movd [r0 + r1 + 8], m2 movh [r0 + 2 * r1], m4 pshufd m4, m4, 2 movd [r0 + 2 * r1 + 8], m4 lea r0, [r0 + 2 * r1] movh [r0 + r1], m6 pshufd m6, 
m6, 2 movd [r0 + r1 + 8], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W12_H4 12, 16 BLOCKCOPY_SP_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 16] lea r2, [r2 + 2 * r3] movu m6, [r2 + r3] movu m7, [r2 + r3 + 16] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + r1], m2 movu [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movu [r0 + r1], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W16_H4 16, 4 BLOCKCOPY_SP_W16_H4 16, 8 BLOCKCOPY_SP_W16_H4 16, 12 BLOCKCOPY_SP_W16_H4 16, 16 BLOCKCOPY_SP_W16_H4 16, 32 BLOCKCOPY_SP_W16_H4 16, 64 BLOCKCOPY_SP_W16_H4 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W16_H8_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r0], xm0 movu [r0 + r1], xm1 movu [r0 + 2 * r1], xm2 movu [r0 + r6], xm3 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 lea r0, [r0 + 4 * r1] movu [r0], xm0 movu [r0 + r1], xm1 movu [r0 + 2 * r1], xm2 movu [r0 + r6], xm3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W16_H8_avx2 16, 16 BLOCKCOPY_SP_W16_H8_avx2 16, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W24_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movu [r0], m0 movlps [r0 + 16], m2 movhps [r0 + r1], m2 movu [r0 + r1 + 8], m4 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W24_H2 24, 32 BLOCKCOPY_SP_W24_H2 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W32_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/2 add r3, r3 .loop: movu m0, [r2] 
movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + r3] movu m5, [r2 + r3 + 16] movu m6, [r2 + r3 + 32] movu m7, [r2 + r3 + 48] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + r1], m4 movu [r0 + r1 + 16], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W32_H2 32, 8 BLOCKCOPY_SP_W32_H2 32, 16 BLOCKCOPY_SP_W32_H2 32, 24 BLOCKCOPY_SP_W32_H2 32, 32 BLOCKCOPY_SP_W32_H2 32, 64 BLOCKCOPY_SP_W32_H2 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W32_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + r3] movu m3, [r2 + r3 + 32] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0], m0 movu [r0 + r1], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + r5] movu m3, [r2 + r5 + 32] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + 2 * r1], m0 movu [r0 + r6], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W32_H4_avx2 32, 32 BLOCKCOPY_SP_W32_H4_avx2 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W64_H1 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu m6, [r2 + 96] movu m7, [r2 + 112] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 movu [r0 + 48], m6 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W64_H1 64, 16 BLOCKCOPY_SP_W64_H1 64, 32 BLOCKCOPY_SP_W64_H1 64, 48 BLOCKCOPY_SP_W64_H1 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W64_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 
64] movu m3, [r2 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0], m0 movu [r0 + 32], m2 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] movu m3, [r2 + r3 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + r1], m0 movu [r0 + r1 + 32], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu m3, [r2 + 2 * r3 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m2 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu m2, [r2 + r5 + 64] movu m3, [r2 + r5 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + r6], m0 movu [r0 + r6 + 32], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W64_H4_avx2 64, 64 ;----------------------------------------------------------------------------- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val add r1, r1 movd m0, r2d pshuflw m0, m0, 0 movh [r0], m0 movh [r0 + r1], m0 movh [r0 + 2 * r1], m0 lea r0, [r0 + 2 * r1] movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_8x8, 3, 4, 1, dst, dstStride, val add r1, r1 lea r3, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_16x16, 3, 4, 1, dst, dstStride, val add r1, r1 lea r3, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 movu [r0], m0 movu [r0 + 16], m0 movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 RET INIT_YMM avx2 cglobal blockfill_s_16x16, 3, 4, 1 add r1, r1 lea r3, [3 * r1] movd xm0, r2d vpbroadcastw m0, xm0 movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], 
m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- %macro BLOCKFILL_S_W32_H8 2 INIT_XMM sse2 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val mov r3d, %2/8 add r1, r1 lea r4, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 .loop: movu [r0], m0 movu [r0 + 16], m0 movu [r0 + 32], m0 movu [r0 + 48], m0 movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + 2 * r1 + 48], m0 movu [r0 + r4], m0 movu [r0 + r4 + 16], m0 movu [r0 + r4 + 32], m0 movu [r0 + r4 + 48], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 movu [r0 + 4 * r1 + 32], m0 movu [r0 + 4 * r1 + 48], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + 2 * r1 + 48], m0 movu [r0 + r4], m0 movu [r0 + r4 + 16], m0 movu [r0 + r4 + 32], m0 movu [r0 + r4 + 48], m0 lea r0, [r0 + 4 * r1] dec r3d jnz .loop RET %endmacro BLOCKFILL_S_W32_H8 32, 32 INIT_YMM avx2 cglobal blockfill_s_32x32, 3, 4, 1 add r1, r1 lea r3, [3 * r1] movd xm0, r2d vpbroadcastw m0, xm0 movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movd [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 RET 
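;-----------------------------------------------------------------------------
; Editor's note (a sketch, not part of the upstream kernels): the
; blockcopy_ps_* routines in this section widen 8-bit pixels to int16_t,
; zero-extending each byte exactly as pmovzxbw does; here "add r1, r1"
; converts dstStride from int16_t elements to bytes.  A hypothetical C
; reference with explicit width/height would be:
;
;   /* hypothetical reference helper, assuming 8-bit pixels */
;   static void blockcopy_ps_c(int16_t *dst, intptr_t dstStride,
;                              const uint8_t *src, intptr_t srcStride,
;                              int width, int height)
;   {
;       for (int y = 0; y < height; y++) {
;           for (int x = 0; x < width; x++)
;               dst[x] = (int16_t)src[x];   /* zero-extend, like pmovzxbw */
;           dst += dstStride;               /* stride given in int16_t elements */
;           src += srcStride;
;       }
;   }
;-----------------------------------------------------------------------------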
;----------------------------------------------------------------------------- ; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movd [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride add r1, r1 mov r4d, 16/2 .loop: movd m0, [r2] movd m1, [r2 + r3] dec r4d lea r2, [r2 + r3 * 2] pmovzxbw m0, m0 pmovzxbw m1, m1 movd [r0], m0 movd [r0 + r1], m1 lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W4_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W4_H4 4, 8 BLOCKCOPY_PS_W4_H4 4, 16 BLOCKCOPY_PS_W4_H4 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); 
;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W6_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movh m0, [r2] pmovzxbw m0, m0 movh [r0], m0 pextrd [r0 + 8], m0, 2 movh m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 pextrd [r0 + r1 + 8], m0, 2 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 pextrd [r0 + 2 * r1 + 8], m0, 2 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 pextrd [r0 + r1 + 8], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W6_H4 6, 8 BLOCKCOPY_PS_W6_H4 6, 16 ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W8_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W8_H4 8, 8 BLOCKCOPY_PS_W8_H4 8, 16 BLOCKCOPY_PS_W8_H4 8, 32 BLOCKCOPY_PS_W8_H4 8, 12 BLOCKCOPY_PS_W8_H4 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, 
intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W12_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movh [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movh [r0 + r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W12_H2 12, 16 BLOCKCOPY_PS_W12_H2 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride add r1, r1 pxor m0, m0 movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + 2 * r3] pmovzxbw m2, m1 movu [r0 + 2 * r1], m2 punpckhbw m1, m0 movu [r0 + 2 * r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W16_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + 2 * r3] pmovzxbw m2, m1 movu [r0 + 2 * r1], m2 punpckhbw m1, m0 movu [r0 + 2 * r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W16_H4 16, 8 BLOCKCOPY_PS_W16_H4 16, 12 BLOCKCOPY_PS_W16_H4 16, 16 BLOCKCOPY_PS_W16_H4 16, 32 BLOCKCOPY_PS_W16_H4 16, 64 BLOCKCOPY_PS_W16_H4 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W16_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_ps_%1x%2, 4, 7, 3 add r1, r1 mov r4d, %2/4 lea r5, [3 * r3] lea r6, [3 * r1] pxor m0, m0 .loop: movu xm1, [r2] pmovzxbw m2, xm1 movu [r0], m2 movu xm1, [r2 + r3] pmovzxbw m2, xm1 movu [r0 + r1], m2 movu xm1, [r2 + 2 * r3] pmovzxbw m2, xm1 movu [r0 + 2 * r1], m2 movu xm1, [r2 + r5] pmovzxbw m2, xm1 movu [r0 + r6], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W16_H4_avx2 16, 16 BLOCKCOPY_PS_W16_H4_avx2 16, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W24_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, 
srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movh m1, [r2 + 16] pmovzxbw m1, m1 movu [r0 + 32], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movh m1, [r2 + r3 + 16] pmovzxbw m1, m1 movu [r0 + r1 + 32], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W24_H2 24, 32 BLOCKCOPY_PS_W24_H2 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W32_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W32_H2 32, 8 BLOCKCOPY_PS_W32_H2 32, 16 BLOCKCOPY_PS_W32_H2 32, 24 BLOCKCOPY_PS_W32_H2 32, 32 BLOCKCOPY_PS_W32_H2 32, 64 BLOCKCOPY_PS_W32_H2 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W32_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_ps_%1x%2, 4, 7, 2 add r1, r1 mov r4d, %2/4 lea r5, [3 * r3] lea r6, [3 * r1] .loop: pmovzxbw m0, [r2 + 0] pmovzxbw m1, [r2 + 16] movu [r0 + 0], m0 movu [r0 + 32], m1 pmovzxbw m0, [r2 + r3 + 0] pmovzxbw m1, [r2 + r3 + 16] movu [r0 + r1 + 0], m0 movu [r0 + r1 + 32], m1 pmovzxbw m0, [r2 + r3 * 2 + 0] pmovzxbw m1, [r2 + r3 * 2 + 16] movu [r0 + r1 * 2 + 0], m0 movu [r0 + r1 * 2 + 32], m1 pmovzxbw m0, [r2 + r5 + 0] pmovzxbw m1, [r2 + r5 + 16] movu [r0 + r6 + 0], m0 movu [r0 + r6 + 32], m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W32_H4_avx2 32, 32 BLOCKCOPY_PS_W32_H4_avx2 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W48_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + 32] pmovzxbw m2, m1 movu [r0 + 64], m2 punpckhbw m1, m0 movu [r0 + 80], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 movu m1, [r2 + r3 + 32] pmovzxbw m2, m1 movu [r0 + r1 + 64], m2 punpckhbw m1, m0 movu [r0 + r1 + 80], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W48_H2 48, 64 
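;-----------------------------------------------------------------------------
; Editor's note (descriptive only): in the wider blockcopy_ps kernels each
; 16-byte load is widened in two halves: pmovzxbw produces the low eight
; words, and "punpckhbw m1, m0" against the zeroed m0 interleaves the high
; eight bytes with zeros, giving the same zero-extension for the upper half.
;-----------------------------------------------------------------------------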
;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W64_H2 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/2 pxor m0, m0 .loop: movu m1, [r2] pmovzxbw m2, m1 movu [r0], m2 punpckhbw m1, m0 movu [r0 + 16], m1 movu m1, [r2 + 16] pmovzxbw m2, m1 movu [r0 + 32], m2 punpckhbw m1, m0 movu [r0 + 48], m1 movu m1, [r2 + 32] pmovzxbw m2, m1 movu [r0 + 64], m2 punpckhbw m1, m0 movu [r0 + 80], m1 movu m1, [r2 + 48] pmovzxbw m2, m1 movu [r0 + 96], m2 punpckhbw m1, m0 movu [r0 + 112], m1 movu m1, [r2 + r3] pmovzxbw m2, m1 movu [r0 + r1], m2 punpckhbw m1, m0 movu [r0 + r1 + 16], m1 movu m1, [r2 + r3 + 16] pmovzxbw m2, m1 movu [r0 + r1 + 32], m2 punpckhbw m1, m0 movu [r0 + r1 + 48], m1 movu m1, [r2 + r3 + 32] pmovzxbw m2, m1 movu [r0 + r1 + 64], m2 punpckhbw m1, m0 movu [r0 + r1 + 80], m1 movu m1, [r2 + r3 + 48] pmovzxbw m2, m1 movu [r0 + r1 + 96], m2 punpckhbw m1, m0 movu [r0 + r1 + 112], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W64_H2 64, 16 BLOCKCOPY_PS_W64_H2 64, 32 BLOCKCOPY_PS_W64_H2 64, 48 BLOCKCOPY_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal blockcopy_ps_64x64, 4, 7, 4 add r1, r1 mov r4d, 64/8 lea r5, [3 * r3] lea r6, [3 * r1] .loop: %rep 2 pmovzxbw m0, [r2 + 0] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] movu [r0 + 0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 pmovzxbw m0, [r2 + r3 + 0] pmovzxbw m1, [r2 + r3 + 16] pmovzxbw m2, [r2 + r3 + 32] pmovzxbw m3, [r2 + r3 + 48] movu [r0 + r1 + 0], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu [r0 + r1 + 96], m3 pmovzxbw m0, [r2 + r3 * 2 + 0] pmovzxbw m1, [r2 + r3 * 2 + 16] pmovzxbw m2, [r2 + r3 * 2 + 32] pmovzxbw m3, [r2 + r3 * 2 + 48] movu [r0 + r1 * 2 + 0], m0 movu [r0 + r1 * 2 + 32], m1 movu [r0 + r1 * 2 + 64], m2 movu [r0 + r1 * 2 + 96], m3 pmovzxbw m0, [r2 + r5 + 0] pmovzxbw m1, [r2 + r5 + 16] pmovzxbw m2, [r2 + r5 + 32] pmovzxbw m3, [r2 + r5 + 48] movu [r0 + r6 + 0], m0 movu [r0 + r6 + 32], m1 movu [r0 + r6 + 64], m2 movu [r0 + r6 + 96], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] %endrep dec r4d jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x4, 4, 6, 0 add r1, r1 add r3, r3 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x8, 4, 6, 0 add r1, r1 add r3, r3 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea 
r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_2x16, 4, 7, 0 add r1, r1 add r3, r3 mov r6d, 16/2 .loop: mov r4d, [r2] mov r5d, [r2 + r3] dec r6d lea r2, [r2 + r3 * 2] mov [r0], r4d mov [r0 + r1], r5d lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x2, 4, 4, 2 add r1, r1 add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_4x4, 4, 4, 4 add r1, r1 add r3, r3 movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + r3 * 2] movh m2, [r2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W4_H8 4, 8 BLOCKCOPY_SS_W4_H8 4, 16 BLOCKCOPY_SS_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x8, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] pshufd m2, m0, 2 pshufd 
m3, m1, 2 movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_6x16, 4, 5, 4 add r1, r1 add r3, r3 mov r4d, 16/2 .loop: movh m0, [r2] movd m2, [r2 + 8] movh m1, [r2 + r3] movd m3, [r2 + r3 + 8] dec r4d lea r2, [r2 + r3 * 2] movh [r0], m0 movd [r0 + 8], m2 movh [r0 + r1], m1 movd [r0 + r1 + 8], m3 lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x2, 4, 4, 2 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x4, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x6, 4, 4, 4 add r1, r1 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r2, [r2 + r3 * 2] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_ss_8x12, 4, 5, 2 add r1, r1 add r3, r3 mov r4d, 12/2 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] dec r4d movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W8_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W8_H8 8, 8 BLOCKCOPY_SS_W8_H8 8, 16 
BLOCKCOPY_SS_W8_H8 8, 32 BLOCKCOPY_SS_W8_H8 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W12_H4 12, 16 BLOCKCOPY_SS_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H4 16, 4 BLOCKCOPY_SS_W16_H4 16, 12 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r6], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H4_avx 16, 4 BLOCKCOPY_SS_W16_H4_avx 16, 12 BLOCKCOPY_SS_W16_H4_avx 16, 8 BLOCKCOPY_SS_W16_H4_avx 16, 16 BLOCKCOPY_SS_W16_H4_avx 16, 24 BLOCKCOPY_SS_W16_H4_avx 16, 32 BLOCKCOPY_SS_W16_H4_avx 16, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W16_H8 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/8 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r2, 
[r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W16_H8 16, 8 BLOCKCOPY_SS_W16_H8 16, 16 BLOCKCOPY_SS_W16_H8 16, 32 BLOCKCOPY_SS_W16_H8 16, 64 BLOCKCOPY_SS_W16_H8 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W24_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W24_H4 24, 32 BLOCKCOPY_SS_W24_H4 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W24_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 2 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu xm1, [r2 + 32] movu [r0], m0 movu [r0 + 32], xm1 movu m0, [r2 + r3] movu xm1, [r2 + r3 + 32] movu [r0 + r1], m0 movu [r0 + r1 + 32], xm1 movu m0, [r2 + 2 * r3] movu xm1, [r2 + 2 * r3 + 32] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], xm1 movu m0, [r2 + r5] movu xm1, [r2 + r5 + 32] movu [r0 + r6], m0 movu [r0 + r6 + 32], xm1 dec r4d lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W24_H4_avx 24, 32 BLOCKCOPY_SS_W24_H4_avx 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W32_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 4 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W32_H4 32, 8 BLOCKCOPY_SS_W32_H4 32, 16 BLOCKCOPY_SS_W32_H4 
32, 24 BLOCKCOPY_SS_W32_H4 32, 32 BLOCKCOPY_SS_W32_H4 32, 64 BLOCKCOPY_SS_W32_H4 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W32_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4 mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu [r0], m0 movu [r0 + 32], m1 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu m0, [r2 + r6] movu m1, [r2 + r6 + 32] movu [r0 + r5], m0 movu [r0 + r5 + 32], m1 dec r4d lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W32_H4_avx 32, 8 BLOCKCOPY_SS_W32_H4_avx 32, 16 BLOCKCOPY_SS_W32_H4_avx 32, 24 BLOCKCOPY_SS_W32_H4_avx 32, 32 BLOCKCOPY_SS_W32_H4_avx 32, 48 BLOCKCOPY_SS_W32_H4_avx 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6 mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + 64], m4 movu [r0 + 80], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu m4, [r2 + r3 + 64] movu m5, [r2 + r3 + 80] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu [r0 + r1 + 64], m4 movu [r0 + r1 + 80], m5 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + 64], m4 movu [r0 + 80], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu m4, [r2 + r3 + 64] movu m5, [r2 + r3 + 80] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu [r0 + r1 + 64], m4 movu [r0 + r1 + 80], m5 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_48x64(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_YMM avx cglobal blockcopy_ss_48x64, 4, 7, 6 mov r4d, 64/4 add r1, r1 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu [r0 + 2 * r1 + 64], m2 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu m2, [r2 + r5 + 64] movu [r0 + r6], m0 movu [r0 + r6 + 32], m1 movu [r0 + r6 + 64], m2 dec r4d lea r2, 
[r2 + 4 * r3] lea r0, [r0 + 4 * r1] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4 2 INIT_XMM sse2 cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + 64] movu m1, [r2 + 80] movu m2, [r2 + 96] movu m3, [r2 + 112] movu [r0 + 64], m0 movu [r0 + 80], m1 movu [r0 + 96], m2 movu [r0 + 112], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu m0, [r2 + r3 + 64] movu m1, [r2 + r3 + 80] movu m2, [r2 + r3 + 96] movu m3, [r2 + r3 + 112] movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m1 movu [r0 + r1 + 96], m2 movu [r0 + r1 + 112], m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu m0, [r2 + 64] movu m1, [r2 + 80] movu m2, [r2 + 96] movu m3, [r2 + 112] movu [r0 + 64], m0 movu [r0 + 80], m1 movu [r0 + 96], m2 movu [r0 + 112], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 movu m0, [r2 + r3 + 64] movu m1, [r2 + r3 + 80] movu m2, [r2 + r3 + 96] movu m3, [r2 + r3 + 112] movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m1 movu [r0 + r1 + 96], m2 movu [r0 + r1 + 112], m3 dec r4d lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] jnz .loop RET %endmacro BLOCKCOPY_SS_W64_H4 64, 16 BLOCKCOPY_SS_W64_H4 64, 32 BLOCKCOPY_SS_W64_H4 64, 48 BLOCKCOPY_SS_W64_H4 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SS_W64_H4_avx 2 INIT_YMM avx cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r1, r1 add r3, r3 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu m3, [r2 + 96] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] movu m3, [r2 + r3 + 96] movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu [r0 + r1 + 96], m3 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu m3, [r2 + 2 * r3 + 96] movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m1 movu [r0 + 2 * r1 + 64], m2 movu [r0 + 2 * r1 + 96], m3 movu m0, [r2 + r6] movu m1, [r2 + r6 + 32] movu m2, [r2 + r6 + 64] movu m3, [r2 + r6 + 96] lea r2, [r2 + 4 * r3] movu [r0 + r5], m0 movu [r0 + r5 + 32], m1 movu [r0 + r5 + 64], m2 movu [r0 + r5 + 96], m3 lea r0, [r0 + 4 * r1] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SS_W64_H4_avx 64, 16 BLOCKCOPY_SS_W64_H4_avx 64, 32 BLOCKCOPY_SS_W64_H4_avx 64, 48 BLOCKCOPY_SS_W64_H4_avx 64, 64 ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t 
srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_4, 3, 4, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; m0 - shift ; m1 - word [-round] ; Row 0-3 movh m2, [r1] movhps m2, [r1 + r2] lea r1, [r1 + r2 * 2] movh m3, [r1] movhps m3, [r1 + r2] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_8, 3, 5, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 8/4 lea r4, [r2 * 3] ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; r4 - stride * 3 ; m0 - shift ; m1 - word [-round] .loop: ; Row 0-1 mova m2, [r1] mova m3, [r1 + r2] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 2-3 mova m2, [r1 + r2 * 2] mova m3, [r1 + r4] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_8, 3, 4, 4 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] ; Row 0-3 movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu xm3, [r1 + 2 * r2] vinserti128 m3, m3, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0], m2 movu [r0 + 32], m3 ; Row 4-7 lea r1, [r1 + 4 * r2] movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu xm3, [r1 + 2 * r2] vinserti128 m3, m3, [r1 + r3], 1 psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 64], m2 movu [r0 + 96], m3 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_16, 3, 4, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 16/2 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift ; m1 - word [-round] .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 1 mova m2, [r1 + r2 + 0 * mmsize] mova m3, [r1 + r2 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_16, 4, 5, 4 add r2d, r2d movd xm0, r3d pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] mov r4d, 16/8 .loop: ; Row 0-1 movu m2, [r1] movu m3, [r1 + r2] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 0 * mmsize], m2 movu [r0 + 1 * mmsize], m3 ; Row 2-3 movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 2 * mmsize], m2 movu [r0 + 3 * mmsize], m3 ; Row 4-5 lea r1, [r1 + 4 * r2] movu m2, [r1] movu m3, [r1 + r2] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 4 * mmsize], m2 movu [r0 + 5 * 
mmsize], m3 ; Row 6-7 movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 6 * mmsize], m2 movu [r0 + 7 * mmsize], m3 add r0, 8 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 32/1 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift ; m1 - word [-round] .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 mova [r0 + 2 * mmsize], m4 mova [r0 + 3 * mmsize], m5 add r0, 4 * mmsize add r1, r2 dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy2Dto1D_shr_32, 4, 5, 4 add r2d, r2d movd xm0, r3d pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] mov r4d, 32/4 .loop: ; Row 0 movu m2, [r1] movu m3, [r1 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 0 * mmsize], m2 movu [r0 + 1 * mmsize], m3 ; Row 1 movu m2, [r1 + r2] movu m3, [r1 + r2 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 2 * mmsize], m2 movu [r0 + 3 * mmsize], m3 ; Row 2 movu m2, [r1 + 2 * r2] movu m3, [r1 + 2 * r2 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 4 * mmsize], m2 movu [r0 + 5 * mmsize], m3 ; Row 3 movu m2, [r1 + r3] movu m3, [r1 + r3 + 32] psubw m2, m1 psraw m2, xm0 psubw m3, m1 psraw m3, xm0 movu [r0 + 6 * mmsize], m2 movu [r0 + 7 * mmsize], m3 add r0, 8 * mmsize lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_4, 3, 3, 3 add r2d, r2d movd m0, r3m ; Row 0-3 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] psllw m1, m0 psllw m2, m0 movh [r0], m1 movhps [r0 + r2], m1 movh [r0 + r2 * 2], m2 lea r2, [r2 * 3] movhps [r0 + r2], m2 RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_4, 3, 3, 2 add r2d, r2d movd xm0, r3m ; Row 0-3 movu m1, [r1] psllw m1, xm0 vextracti128 xm0, m1, 1 movq [r0], xm1 movhps [r0 + r2], xm1 lea r0, [r0 + r2 * 2] movq [r0], xm0 movhps [r0 + r2], xm0 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_8, 3, 4, 5 add r2d, r2d movd m0, r3m lea r3, [r2 * 3] ; Row 0-3 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + r2], m2 mova [r0 + r2 * 2], m3 mova [r0 + r3], m4 lea r0, [r0 + r2 * 4] ; Row 4-7 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + r2], 
m2 mova [r0 + r2 * 2], m3 mova [r0 + r3], m4 RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_8, 3, 4, 3 add r2d, r2d movd xm0, r3m lea r3, [r2 * 3] ; Row 0-3 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0], xm1 vextracti128 [r0 + r2], m1, 1 movu [r0 + r2 * 2], xm2 vextracti128 [r0 + r3], m2, 1 ; Row 4-7 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] lea r0, [r0 + r2 * 4] psllw m1, xm0 psllw m2, xm0 movu [r0], xm1 vextracti128 [r0 + r2], m1, 1 movu [r0 + r2 * 2], xm2 vextracti128 [r0 + r3], m2, 1 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_16, 3, 4, 5 add r2d, r2d movd m0, r3m mov r3d, 16/4 .loop: ; Row 0-1 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + 16], m2 mova [r0 + r2], m3 mova [r0 + r2 + 16], m4 ; Row 2-3 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] lea r0, [r0 + r2 * 2] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0], m1 mova [r0 + 16], m2 mova [r0 + r2], m3 mova [r0 + r2 + 16], m4 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_16, 3, 5, 3 add r2d, r2d movd xm0, r3m mov r3d, 16/4 lea r4, [r2 * 3] .loop: ; Row 0-1 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0], m1 movu [r0 + r2], m2 ; Row 2-3 movu m1, [r1 + 2 * mmsize] movu m2, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 movu [r0 + r2 * 2], m1 movu [r0 + r4], m2 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shl_32, 3, 4, 5 add r2d, r2d movd m0, r3m mov r3d, 32/2 .loop: ; Row 0 mova m1, [r1 + 0 * mmsize] mova m2, [r1 + 1 * mmsize] mova m3, [r1 + 2 * mmsize] mova m4, [r1 + 3 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0 + 0 * mmsize], m1 mova [r0 + 1 * mmsize], m2 mova [r0 + 2 * mmsize], m3 mova [r0 + 3 * mmsize], m4 ; Row 1 mova m1, [r1 + 4 * mmsize] mova m2, [r1 + 5 * mmsize] mova m3, [r1 + 6 * mmsize] mova m4, [r1 + 7 * mmsize] psllw m1, m0 psllw m2, m0 psllw m3, m0 psllw m4, m0 mova [r0 + r2 + 0 * mmsize], m1 mova [r0 + r2 + 1 * mmsize], m2 mova [r0 + r2 + 2 * mmsize], m3 mova [r0 + r2 + 3 * mmsize], m4 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shl_32, 3, 4, 5 add r2d, r2d movd xm0, r3m mov r3d, 32/2 .loop: ; Row 0-1 movu m1, [r1 + 0 * mmsize] movu m2, [r1 + 1 * mmsize] movu m3, [r1 + 2 * mmsize] movu m4, [r1 + 3 * mmsize] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0], m1 movu [r0 + mmsize], m2 movu [r0 + r2], m3 movu [r0 + r2 + mmsize], m4 add r1, 4 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); 
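; The copy_cnt_N kernels below copy an NxN block of 16-bit coefficients from a
; strided source into a packed destination and return the number of nonzero
; coefficients. A minimal C sketch of the intended semantics, inferred from the
; SIMD code (the packsswb/pcmpeqb/psadbw sequence counts nonzero entries while
; the movu stores pack the rows). The name copy_cnt_c and the parameter n are
; hypothetical, srcStride is assumed to be in int16_t units, and this sketch is
; illustrative only, not part of the build:
;
;   #include <stdint.h>
;
;   static uint32_t copy_cnt_c(int16_t *dst, const int16_t *src,
;                              intptr_t srcStride, int n)
;   {
;       uint32_t nonzero = 0;
;       for (int y = 0; y < n; y++) {
;           for (int x = 0; x < n; x++) {
;               int16_t coeff = src[y * srcStride + x];
;               dst[y * n + x] = coeff;      // pack rows contiguously
;               nonzero += (coeff != 0);     // tally nonzero coefficients
;           }
;       }
;       return nonzero;
;   }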
;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_4, 3,3,3 add r2d, r2d pxor m2, m2 ; row 0 & 1 movh m0, [r1] movhps m0, [r1 + r2] mova [r0], m0 ; row 2 & 3 movh m1, [r1 + r2 * 2] lea r2, [r2 * 3] movhps m1, [r1 + r2] mova [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m2 ; get count ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem %if 0 pmovmskb eax, m0 not ax popcnt ax, ax %else mova m1, [pb_1] paddb m0, m1 psadbw m0, m2 pshufd m1, m0, 2 paddw m0, m1 movd eax, m0 %endif RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_8, 3,3,6 add r2d, r2d pxor m4, m4 pxor m5, m5 ; row 0 & 1 movu m0, [r1] movu m1, [r1 + r2] movu [r0], m0 movu [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 2 & 3 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 32], m0 movu [r0 + 48], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 4 & 5 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 64], m0 movu [r0 + 80], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 6 & 7 lea r1, [r1 + 2 * r2] movu m0, [r1] movu m1, [r1 + r2] movu [r0 + 96], m0 movu [r0 + 112], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; get count mova m0, [pb_4] paddb m5, m0 psadbw m5, m4 pshufd m0, m5, 2 paddw m5, m0 movd eax, m5 RET INIT_YMM avx2 cglobal copy_cnt_8, 3,4,5 add r2d, r2d lea r3, [r2 * 3] ; row 0 - 1 movu xm0, [r1] vinserti128 m0, m0, [r1 + r2], 1 movu [r0], m0 ; row 2 - 3 movu xm1, [r1 + r2 * 2] vinserti128 m1, m1, [r1 + r3], 1 movu [r0 + 32], m1 lea r1, [r1 + r2 * 4] ; row 4 - 5 movu xm2, [r1] vinserti128 m2, m2, [r1 + r2], 1 movu [r0 + 64], m2 ; row 6 - 7 movu xm3, [r1 + r2 * 2] vinserti128 m3, m3, [r1 + r3], 1 movu [r0 + 96], m3 ; get count xorpd m4, m4 vpacksswb m0, m1 vpacksswb m2, m3 pminub m0, [pb_1] pminub m2, [pb_1] paddb m0, m2 vextracti128 xm1, m0, 1 paddb xm0, xm1 psadbw xm0, xm4 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_16, 3,4,6 add r2d, r2d mov r3d, 4 pxor m4, m4 pxor m5, m5 .loop: ; row 0 movu m0, [r1] movu m1, [r1 + 16] movu [r0], m0 movu [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 1 movu m0, [r1 + r2] movu m1, [r1 + r2 + 16] movu [r0 + 32], m0 movu [r0 + 48], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 2 movu m0, [r1 + 2 * r2] movu m1, [r1 + 2 * r2 + 16] movu [r0 + 64], m0 movu [r0 + 80], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 3 lea r1, [r1 + 2 * r2] movu m0, [r1 + r2] movu m1, [r1 + r2 + 16] movu [r0 + 96], m0 movu [r0 + 112], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 add r0, 128 lea r1, [r1 + 2 * r2] dec r3d jnz .loop mova m0, [pb_16] paddb m5, m0 psadbw m5, m4 pshufd m0, m5, 2 paddw m5, m0 movd eax, m5 RET INIT_YMM avx2 cglobal copy_cnt_16, 3, 5, 5 add r2d, r2d lea r3, [r2 * 3] mov r4d, 16/4 mova m3, [pb_1] xorpd m4, m4 .loop: ; row 0 - 1 movu m0, [r1] movu [r0], m0 movu m1, [r1 + r2] movu [r0 + 32], m1 packsswb m0, m1 pminub m0, m3 ; row 2 - 3 movu m1, [r1 + r2 * 2] movu [r0 + 64], m1 movu 
m2, [r1 + r3] movu [r0 + 96], m2 packsswb m1, m2 pminub m1, m3 paddb m0, m1 paddb m4, m0 add r0, 128 lea r1, [r1 + 4 * r2] dec r4d jnz .loop ; get count xorpd m0, m0 vextracti128 xm1, m4, 1 paddb xm4, xm1 psadbw xm4, xm0 movhlps xm1, xm4 paddd xm4, xm1 movd eax, xm4 RET ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride); ;-------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal copy_cnt_32, 3,4,6 add r2d, r2d mov r3d, 16 pxor m4, m4 pxor m5, m5 .loop: ; row 0 movu m0, [r1] movu m1, [r1 + 16] movu [r0], m0 movu [r0 + 16], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 movu m0, [r1 + 32] movu m1, [r1 + 48] movu [r0 + 32], m0 movu [r0 + 48], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 ; row 1 movu m0, [r1 + r2] movu m1, [r1 + r2 + 16] movu [r0 + 64], m0 movu [r0 + 80], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 movu m0, [r1 + r2 + 32] movu m1, [r1 + r2 + 48] movu [r0 + 96], m0 movu [r0 + 112], m1 packsswb m0, m1 pcmpeqb m0, m4 paddb m5, m0 add r0, 128 lea r1, [r1 + 2 * r2] dec r3d jnz .loop ; get count mova m0, [pb_64] paddb m5, m0 psadbw m5, m4 pshufd m0, m5, 2 paddw m5, m0 movd eax, m5 RET INIT_YMM avx2 cglobal copy_cnt_32, 3, 5, 5 add r2d, r2d mov r3d, 32/2 mova m3, [pb_1] xorpd m4, m4 .loop: ; row 0 movu m0, [r1] movu [r0], m0 movu m1, [r1 + 32] movu [r0 + 32], m1 packsswb m0, m1 pminub m0, m3 ; row 1 movu m1, [r1 + r2] movu [r0 + 64], m1 movu m2, [r1 + r2 + 32] movu [r0 + 96], m2 packsswb m1, m2 pminub m1, m3 paddb m0, m1 paddb m4, m0 add r0, 128 lea r1, [r1 + 2 * r2] dec r3d jnz .loop ; get count xorpd m0, m0 vextracti128 xm1, m4, 1 paddb xm4, xm1 psadbw xm4, xm0 movhlps xm1, xm4 paddd xm4, xm1 movd eax, xm4 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shl_4, 4, 4, 4 add r2d, r2d movd m0, r3d ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; m0 - shift ; Row 0-3 movh m2, [r1] movhps m2, [r1 + r2] lea r1, [r1 + r2 * 2] movh m3, [r1] movhps m3, [r1 + r2] psllw m2, m0 psllw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shl_8, 4, 5, 4 add r2d, r2d movd m0, r3d mov r3d, 8/4 lea r4, [r2 * 3] ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; r4 - stride * 3 ; m0 - shift .loop: ; Row 0, 1 mova m2, [r1] mova m3, [r1 + r2] psllw m2, m0 psllw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 2, 3 mova m2, [r1 + r2 * 2] mova m3, [r1 + r4] psllw m2, m0 psllw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl_8(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal cpy2Dto1D_shl_8, 4, 5, 2 add r2d, r2d movd xm0, r3d lea r4, [3 * 
r2] ; Row 0, 1 movu xm1, [r1] vinserti128 m1, m1, [r1 + r2], 1 psllw m1, xm0 movu [r0], m1 ; Row 2, 3 movu xm1, [r1 + 2 * r2] vinserti128 m1, m1, [r1 + r4], 1 psllw m1, xm0 movu [r0 + 32], m1 lea r1, [r1 + 4 * r2] ; Row 4, 5 movu xm1, [r1] vinserti128 m1, m1, [r1 + r2], 1 psllw m1, xm0 movu [r0 + 64], m1 ; Row 6, 7 movu xm1, [r1 + 2 * r2] vinserti128 m1, m1, [r1 + r4], 1 psllw m1, xm0 movu [r0 + 96], m1 RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shl_16, 4, 4, 4 add r2d, r2d movd m0, r3d mov r3d, 16/2 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] psllw m2, m0 psllw m3, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 ; Row 1 mova m2, [r1 + r2 + 0 * mmsize] mova m3, [r1 + r2 + 1 * mmsize] psllw m2, m0 psllw m3, m0 mova [r0 + 2 * mmsize], m2 mova [r0 + 3 * mmsize], m3 add r0, 4 * mmsize lea r1, [r1 + r2 * 2] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl_16(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal cpy2Dto1D_shl_16, 3, 5, 3 add r2d, r2d movd xm0, r3m mov r3d, 16/4 lea r4, [r2 * 3] .loop: ; Row 0-1 movu m1, [r1] movu m2, [r1 + r2] psllw m1, xm0 psllw m2, xm0 movu [r0 + 0 * mmsize], m1 movu [r0 + 1 * mmsize], m2 ; Row 2-3 movu m1, [r1 + 2 * r2] movu m2, [r1 + r4] psllw m1, xm0 psllw m2, xm0 movu [r0 + 2 * mmsize], m1 movu [r0 + 3 * mmsize], m2 add r0, 4 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy2Dto1D_shl_32, 4, 4, 6 add r2d, r2d movd m0, r3d mov r3d, 32/1 ; register alloc ; r0 - dst ; r1 - src ; r2 - srcStride ; r3 - loop counter ; m0 - shift .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psllw m2, m0 psllw m3, m0 psllw m4, m0 psllw m5, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 mova [r0 + 2 * mmsize], m4 mova [r0 + 3 * mmsize], m5 add r0, 4 * mmsize add r1, r2 dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl_32(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal cpy2Dto1D_shl_32, 3, 5, 5 add r2d, r2d movd xm0, r3m mov r3d, 32/4 lea r4, [3 * r2] .loop: ; Row 0-1 movu m1, [r1] movu m2, [r1 + 32] movu m3, [r1 + r2] movu m4, [r1 + r2 + 32] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0], m1 movu [r0 + mmsize], m2 movu [r0 + 2 * mmsize], m3 movu [r0 + 3 * mmsize], m4 ; Row 2-3 movu m1, [r1 + 2 * r2] movu m2, [r1 + 2 * r2 + 32] movu m3, [r1 + r4] movu m4, [r1 + r4 + 32] psllw m1, xm0 psllw m2, xm0 psllw m3, xm0 psllw m4, xm0 movu [r0 + 4 * mmsize], m1 movu [r0 + 5 * mmsize], m2 movu [r0 + 6 * mmsize], m3 movu [r0 + 
7 * mmsize], m4 add r0, 8 * mmsize lea r1, [r1 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shr_4, 3, 3, 4 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 ; Row 0-3 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, m0 psraw m3, m0 movh [r0], m2 movhps [r0 + r2], m2 movh [r0 + r2 * 2], m3 lea r2, [r2 * 3] movhps [r0 + r2], m3 RET INIT_YMM avx2 cglobal cpy1Dto2D_shr_4, 3, 3, 3 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 ; Row 0-3 movu m2, [r1] psubw m2, m1 psraw m2, xm0 vextracti128 xm1, m2, 1 movq [r0], xm2 movhps [r0 + r2], xm2 lea r0, [r0 + r2 * 2] movq [r0], xm1 movhps [r0 + r2], xm1 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shr_8, 3, 4, 6 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 lea r3, [r2 * 3] ; Row 0-3 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0], m2 mova [r0 + r2], m3 mova [r0 + r2 * 2], m4 mova [r0 + r3], m5 ; Row 4-7 mova m2, [r1 + 4 * mmsize] mova m3, [r1 + 5 * mmsize] mova m4, [r1 + 6 * mmsize] mova m5, [r1 + 7 * mmsize] lea r0, [r0 + r2 * 4] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0], m2 mova [r0 + r2], m3 mova [r0 + r2 * 2], m4 mova [r0 + r3], m5 RET INIT_YMM avx2 cglobal cpy1Dto2D_shr_8, 3, 4, 4 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] ; Row 0-3 movu m2, [r1 + 0 * mmsize] movu m3, [r1 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, xm0 psraw m3, xm0 movu [r0], xm2 vextracti128 [r0 + r2], m2, 1 movu [r0 + r2 * 2], xm3 vextracti128 [r0 + r3], m3, 1 ; Row 4-7 movu m2, [r1 + 2 * mmsize] movu m3, [r1 + 3 * mmsize] lea r0, [r0 + r2 * 4] psubw m2, m1 psubw m3, m1 psraw m2, xm0 psraw m3, xm0 movu [r0], xm2 vextracti128 [r0 + r2], m2, 1 movu [r0 + r2 * 2], xm3 vextracti128 [r0 + r3], m3, 1 RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shr_16, 3, 5, 6 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 16/4 lea r4, [r2 * 3] .loop: ; Row 0-1 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0], m2 mova [r0 + mmsize], m3 mova [r0 + r2], m4 mova [r0 + r2 + mmsize], m5 ; Row 2-3 mova m2, [r1 + 4 * mmsize] mova m3, [r1 + 5 * mmsize] mova m4, [r1 + 6 * mmsize] mova m5, [r1 + 7 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0 + r2 * 2], m2 mova [r0 + r2 * 2 + 
mmsize], m3 mova [r0 + r4], m4 mova [r0 + r4 + mmsize], m5 add r1, 8 * mmsize lea r0, [r0 + r2 * 4] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shr_16, 3, 5, 4 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 mov r3d, 16/4 lea r4, [r2 * 3] .loop: ; Row 0-1 movu m2, [r1 + 0 * mmsize] movu m3, [r1 + 1 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, xm0 psraw m3, xm0 movu [r0], m2 movu [r0 + r2], m3 ; Row 2-3 movu m2, [r1 + 2 * mmsize] movu m3, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psraw m2, xm0 psraw m3, xm0 movu [r0 + r2 * 2], m2 movu [r0 + r4], m3 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] dec r3d jnz .loop RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal cpy1Dto2D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 32/2 .loop: ; Row 0 mova m2, [r1 + 0 * mmsize] mova m3, [r1 + 1 * mmsize] mova m4, [r1 + 2 * mmsize] mova m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0 + 0 * mmsize], m2 mova [r0 + 1 * mmsize], m3 mova [r0 + 2 * mmsize], m4 mova [r0 + 3 * mmsize], m5 ; Row 1 mova m2, [r1 + 4 * mmsize] mova m3, [r1 + 5 * mmsize] mova m4, [r1 + 6 * mmsize] mova m5, [r1 + 7 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, m0 psraw m3, m0 psraw m4, m0 psraw m5, m0 mova [r0 + r2 + 0 * mmsize], m2 mova [r0 + r2 + 1 * mmsize], m3 mova [r0 + r2 + 2 * mmsize], m4 mova [r0 + r2 + 3 * mmsize], m5 add r1, 8 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET INIT_YMM avx2 cglobal cpy1Dto2D_shr_32, 3, 4, 6 add r2d, r2d movd xm0, r3m pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 mov r3d, 32/2 .loop: ; Row 0-1 movu m2, [r1 + 0 * mmsize] movu m3, [r1 + 1 * mmsize] movu m4, [r1 + 2 * mmsize] movu m5, [r1 + 3 * mmsize] psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 psraw m2, xm0 psraw m3, xm0 psraw m4, xm0 psraw m5, xm0 movu [r0], m2 movu [r0 + mmsize], m3 movu [r0 + r2], m4 movu [r0 + r2 + mmsize], m5 add r1, 4 * mmsize lea r0, [r0 + r2 * 2] dec r3d jnz .loop RET xavs2-1.3/source/common/x86/blockcopy8.h000066400000000000000000000472141340660520300200720ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Steve Borho ;* Min Chen * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #ifndef XAVS2_BLOCKCOPY_H #define XAVS2_BLOCKCOPY_H #define xavs2_cpy2Dto1D_shr_4_sse2 FPFX(cpy2Dto1D_shr_4_sse2) void xavs2_cpy2Dto1D_shr_4_sse2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_8_sse2 FPFX(cpy2Dto1D_shr_8_sse2) void xavs2_cpy2Dto1D_shr_8_sse2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_16_sse2 FPFX(cpy2Dto1D_shr_16_sse2) void xavs2_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_32_sse2 FPFX(cpy2Dto1D_shr_32_sse2) void xavs2_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shl_4_avx2 FPFX(cpy1Dto2D_shl_4_avx2) void xavs2_cpy1Dto2D_shl_4_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shl_8_avx2 FPFX(cpy1Dto2D_shl_8_avx2) void xavs2_cpy1Dto2D_shl_8_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shl_16_avx2 FPFX(cpy1Dto2D_shl_16_avx2) void xavs2_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shl_32_avx2 FPFX(cpy1Dto2D_shl_32_avx2) void xavs2_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shl_4_sse2 FPFX(cpy1Dto2D_shl_4_sse2) void xavs2_cpy1Dto2D_shl_4_sse2 (int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shl_8_sse2 FPFX(cpy1Dto2D_shl_8_sse2) void xavs2_cpy1Dto2D_shl_8_sse2 (int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shl_16_sse2 FPFX(cpy1Dto2D_shl_16_sse2) void xavs2_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shl_32_sse2 FPFX(cpy1Dto2D_shl_32_sse2) void xavs2_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_4_avx2 FPFX(cpy1Dto2D_shr_4_avx2) void xavs2_cpy1Dto2D_shr_4_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy1Dto2D_shr_8_avx2 FPFX(cpy1Dto2D_shr_8_avx2) void xavs2_cpy1Dto2D_shr_8_avx2 (int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_16_avx2 FPFX(cpy1Dto2D_shr_16_avx2) void xavs2_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_32_avx2 FPFX(cpy1Dto2D_shr_32_avx2) void xavs2_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_4_sse2 FPFX(cpy1Dto2D_shr_4_sse2) void xavs2_cpy1Dto2D_shr_4_sse2 (int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_8_sse2 FPFX(cpy1Dto2D_shr_8_sse2) void xavs2_cpy1Dto2D_shr_8_sse2 (int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_16_sse2 FPFX(cpy1Dto2D_shr_16_sse2) void xavs2_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy1Dto2D_shr_32_sse2 FPFX(cpy1Dto2D_shr_32_sse2) void xavs2_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); #define xavs2_cpy2Dto1D_shl_8_avx2 FPFX(cpy2Dto1D_shl_8_avx2) void xavs2_cpy2Dto1D_shl_8_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define 
xavs2_cpy2Dto1D_shl_16_avx2 FPFX(cpy2Dto1D_shl_16_avx2) void xavs2_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shl_32_avx2 FPFX(cpy2Dto1D_shl_32_avx2) void xavs2_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_8_avx2 FPFX(cpy2Dto1D_shr_8_avx2) void xavs2_cpy2Dto1D_shr_8_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_16_avx2 FPFX(cpy2Dto1D_shr_16_avx2) void xavs2_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_cpy2Dto1D_shr_32_avx2 FPFX(cpy2Dto1D_shr_32_avx2) void xavs2_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); #define xavs2_copy_cnt_4_sse4 FPFX(copy_cnt_4_sse4) uint32_t xavs2_copy_cnt_4_sse4 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_8_sse4 FPFX(copy_cnt_8_sse4) uint32_t xavs2_copy_cnt_8_sse4 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_16_sse4 FPFX(copy_cnt_16_sse4) uint32_t xavs2_copy_cnt_16_sse4 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_32_sse4 FPFX(copy_cnt_32_sse4) uint32_t xavs2_copy_cnt_32_sse4 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_4_avx2 FPFX(copy_cnt_4_avx2) uint32_t xavs2_copy_cnt_4_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_8_avx2 FPFX(copy_cnt_8_avx2) uint32_t xavs2_copy_cnt_8_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_16_avx2 FPFX(copy_cnt_16_avx2) uint32_t xavs2_copy_cnt_16_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define xavs2_copy_cnt_32_avx2 FPFX(copy_cnt_32_avx2) uint32_t xavs2_copy_cnt_32_avx2 (int16_t* dst, const int16_t* src, intptr_t srcStride); #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \ void xavs2_blockcopy_pp_ ## W ## x ## H ## cpu(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); \ void xavs2_blockcopy_sp_ ## W ## x ## H ## cpu(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); \ void xavs2_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_PS(W, H, cpu) \ void xavs2_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const uint8_t *src, intptr_t srcStride); #define SETUP_BLOCKCOPY_SP(W, H, cpu) \ void xavs2_blockcopy_sp_ ## W ## x ## H ## cpu(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \ void xavs2_blockcopy_pp_ ## W ## x ## H ## cpu(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); \ void xavs2_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define BLOCKCOPY_COMMON(cpu) \ SETUP_BLOCKCOPY_FUNC(64, 64, cpu); /* 64x64 */ \ SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \ SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \ SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \ SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \ SETUP_BLOCKCOPY_FUNC(16, 64, cpu); \ SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \ SETUP_BLOCKCOPY_FUNC(32, 32, cpu); /* 32x32 */ \ SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \ SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \ SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \ SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \ SETUP_BLOCKCOPY_FUNC( 8, 32, cpu); \ SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \ SETUP_BLOCKCOPY_FUNC(16, 16, cpu); /* 16x16 */ \ SETUP_BLOCKCOPY_FUNC(16, 
8, cpu); \ SETUP_BLOCKCOPY_FUNC( 8, 16, cpu); \ SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \ SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \ SETUP_BLOCKCOPY_FUNC( 4, 16, cpu); \ SETUP_BLOCKCOPY_FUNC(12, 16, cpu); \ SETUP_BLOCKCOPY_FUNC( 8, 8, cpu); /* 8x8 */ \ SETUP_BLOCKCOPY_FUNC( 8, 4, cpu); \ SETUP_BLOCKCOPY_FUNC( 4, 8, cpu); \ SETUP_BLOCKCOPY_FUNC( 4, 4, cpu); /* 4x4 */ #define BLOCKCOPY_SP(cpu) \ SETUP_BLOCKCOPY_SP( 2, 4, cpu); \ SETUP_BLOCKCOPY_SP( 2, 8, cpu); \ SETUP_BLOCKCOPY_SP( 6, 8, cpu); \ \ SETUP_BLOCKCOPY_SP( 2, 16, cpu); \ SETUP_BLOCKCOPY_SP( 4, 32, cpu); \ SETUP_BLOCKCOPY_SP( 6, 16, cpu); \ SETUP_BLOCKCOPY_SP( 8, 12, cpu); \ SETUP_BLOCKCOPY_SP( 8, 64, cpu); \ SETUP_BLOCKCOPY_SP(12, 32, cpu); \ SETUP_BLOCKCOPY_SP(16, 24, cpu); \ SETUP_BLOCKCOPY_SP(24, 64, cpu); \ SETUP_BLOCKCOPY_SP(32, 48, cpu); #define BLOCKCOPY_SS_PP(cpu) \ SETUP_BLOCKCOPY_SS_PP( 2, 4, cpu); \ SETUP_BLOCKCOPY_SS_PP( 2, 8, cpu); \ SETUP_BLOCKCOPY_SS_PP( 6, 8, cpu); \ \ SETUP_BLOCKCOPY_SS_PP( 2, 16, cpu); \ SETUP_BLOCKCOPY_SS_PP( 4, 32, cpu); \ SETUP_BLOCKCOPY_SS_PP( 6, 16, cpu); \ SETUP_BLOCKCOPY_SS_PP( 8, 12, cpu); \ SETUP_BLOCKCOPY_SS_PP( 8, 64, cpu); \ SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \ SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \ SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \ SETUP_BLOCKCOPY_SS_PP(32, 48, cpu); #define BLOCKCOPY_PS(cpu) \ SETUP_BLOCKCOPY_PS(64, 64, cpu); /* 64x64 */ \ SETUP_BLOCKCOPY_PS(64, 32, cpu); \ SETUP_BLOCKCOPY_PS(32, 64, cpu); \ SETUP_BLOCKCOPY_PS(64, 16, cpu); \ SETUP_BLOCKCOPY_PS(64, 48, cpu); \ SETUP_BLOCKCOPY_PS(16, 64, cpu); \ SETUP_BLOCKCOPY_PS(48, 64, cpu); \ SETUP_BLOCKCOPY_PS(32, 32, cpu); /* 32x32 */ \ SETUP_BLOCKCOPY_PS(32, 16, cpu); \ SETUP_BLOCKCOPY_PS(16, 32, cpu); \ SETUP_BLOCKCOPY_PS(32, 8, cpu); \ SETUP_BLOCKCOPY_PS(32, 24, cpu); \ SETUP_BLOCKCOPY_PS( 8, 32, cpu); \ SETUP_BLOCKCOPY_PS(24, 32, cpu); \ SETUP_BLOCKCOPY_PS(16, 16, cpu); /* 16x16 */ \ SETUP_BLOCKCOPY_PS(16, 8, cpu); \ SETUP_BLOCKCOPY_PS( 8, 16, cpu); \ SETUP_BLOCKCOPY_PS(16, 4, cpu); \ SETUP_BLOCKCOPY_PS(16, 12, cpu); \ SETUP_BLOCKCOPY_PS( 4, 16, cpu); \ SETUP_BLOCKCOPY_PS(12, 16, cpu); \ SETUP_BLOCKCOPY_PS( 8, 8, cpu); /* 8x8 */ \ SETUP_BLOCKCOPY_PS( 8, 4, cpu); \ SETUP_BLOCKCOPY_PS( 4, 8, cpu); \ SETUP_BLOCKCOPY_PS( 8, 2, cpu); \ SETUP_BLOCKCOPY_PS( 8, 6, cpu); \ SETUP_BLOCKCOPY_PS( 2, 8, cpu); \ SETUP_BLOCKCOPY_PS( 6, 8, cpu); \ SETUP_BLOCKCOPY_PS( 4, 4, cpu); /* 4x4 */ \ SETUP_BLOCKCOPY_PS( 4, 2, cpu); \ SETUP_BLOCKCOPY_PS( 2, 4, cpu); \ \ SETUP_BLOCKCOPY_PS(32, 48, cpu); \ SETUP_BLOCKCOPY_PS(24, 64, cpu); \ SETUP_BLOCKCOPY_PS(16, 24, cpu); \ SETUP_BLOCKCOPY_PS(12, 32, cpu); \ SETUP_BLOCKCOPY_PS( 8, 64, cpu); \ SETUP_BLOCKCOPY_PS( 8, 12, cpu); \ SETUP_BLOCKCOPY_PS( 6, 16, cpu); \ SETUP_BLOCKCOPY_PS( 4, 32, cpu); \ SETUP_BLOCKCOPY_PS( 2, 16, cpu); BLOCKCOPY_COMMON(_sse2) BLOCKCOPY_SS_PP(_sse2) BLOCKCOPY_SP(_sse4) BLOCKCOPY_PS(_sse4) BLOCKCOPY_SP(_sse2) #define xavs2_blockfill_s_4x4_sse2 FPFX(blockfill_s_4x4_sse2) void xavs2_blockfill_s_4x4_sse2 (int16_t* dst, intptr_t dstride, int16_t val); #define xavs2_blockfill_s_8x8_sse2 FPFX(blockfill_s_8x8_sse2) void xavs2_blockfill_s_8x8_sse2 (int16_t* dst, intptr_t dstride, int16_t val); #define xavs2_blockfill_s_16x16_sse2 FPFX(blockfill_s_16x16_sse2) void xavs2_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val); #define xavs2_blockfill_s_32x32_sse2 FPFX(blockfill_s_32x32_sse2) void xavs2_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val); #define xavs2_blockcopy_ss_16x4_avx FPFX(blockcopy_ss_16x4_avx) void xavs2_blockcopy_ss_16x4_avx (int16_t* dst, 
intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x8_avx FPFX(blockcopy_ss_16x8_avx) void xavs2_blockcopy_ss_16x8_avx (int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x12_avx FPFX(blockcopy_ss_16x12_avx) void xavs2_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x16_avx FPFX(blockcopy_ss_16x16_avx) void xavs2_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x24_avx FPFX(blockcopy_ss_16x24_avx) void xavs2_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x32_avx FPFX(blockcopy_ss_16x32_avx) void xavs2_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_16x64_avx FPFX(blockcopy_ss_16x64_avx) void xavs2_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_64x16_avx FPFX(blockcopy_ss_64x16_avx) void xavs2_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_64x32_avx FPFX(blockcopy_ss_64x32_avx) void xavs2_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_64x48_avx FPFX(blockcopy_ss_64x48_avx) void xavs2_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_64x64_avx FPFX(blockcopy_ss_64x64_avx) void xavs2_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x8_avx FPFX(blockcopy_ss_32x8_avx) void xavs2_blockcopy_ss_32x8_avx (int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x16_avx FPFX(blockcopy_ss_32x16_avx) void xavs2_blockcopy_ss_32x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x24_avx FPFX(blockcopy_ss_32x24_avx) void xavs2_blockcopy_ss_32x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x32_avx FPFX(blockcopy_ss_32x32_avx) void xavs2_blockcopy_ss_32x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x48_avx FPFX(blockcopy_ss_32x48_avx) void xavs2_blockcopy_ss_32x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_32x64_avx FPFX(blockcopy_ss_32x64_avx) void xavs2_blockcopy_ss_32x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_48x64_avx FPFX(blockcopy_ss_48x64_avx) void xavs2_blockcopy_ss_48x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_24x32_avx FPFX(blockcopy_ss_24x32_avx) void xavs2_blockcopy_ss_24x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_ss_24x64_avx FPFX(blockcopy_ss_24x64_avx) void xavs2_blockcopy_ss_24x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); #define xavs2_blockcopy_pp_32x8_avx FPFX(blockcopy_pp_32x8_avx) void xavs2_blockcopy_pp_32x8_avx (uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define 
xavs2_blockcopy_pp_32x16_avx FPFX(blockcopy_pp_32x16_avx) void xavs2_blockcopy_pp_32x16_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_32x24_avx FPFX(blockcopy_pp_32x24_avx) void xavs2_blockcopy_pp_32x24_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_32x32_avx FPFX(blockcopy_pp_32x32_avx) void xavs2_blockcopy_pp_32x32_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_32x48_avx FPFX(blockcopy_pp_32x48_avx) void xavs2_blockcopy_pp_32x48_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_32x64_avx FPFX(blockcopy_pp_32x64_avx) void xavs2_blockcopy_pp_32x64_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_64x16_avx FPFX(blockcopy_pp_64x16_avx) void xavs2_blockcopy_pp_64x16_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_64x32_avx FPFX(blockcopy_pp_64x32_avx) void xavs2_blockcopy_pp_64x32_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_64x48_avx FPFX(blockcopy_pp_64x48_avx) void xavs2_blockcopy_pp_64x48_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_64x64_avx FPFX(blockcopy_pp_64x64_avx) void xavs2_blockcopy_pp_64x64_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_pp_48x64_avx FPFX(blockcopy_pp_48x64_avx) void xavs2_blockcopy_pp_48x64_avx(uint8_t *a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockfill_s_16x16_avx2 FPFX(blockfill_s_16x16_avx2) void xavs2_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val); #define xavs2_blockfill_s_32x32_avx2 FPFX(blockfill_s_32x32_avx2) void xavs2_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val); // copy_sp primitives // 16 x N #define xavs2_blockcopy_sp_16x16_avx2 FPFX(blockcopy_sp_16x16_avx2) void xavs2_blockcopy_sp_16x16_avx2(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define xavs2_blockcopy_sp_16x32_avx2 FPFX(blockcopy_sp_16x32_avx2) void xavs2_blockcopy_sp_16x32_avx2(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); // 32 x N #define xavs2_blockcopy_sp_32x32_avx2 FPFX(blockcopy_sp_32x32_avx2) void xavs2_blockcopy_sp_32x32_avx2(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define xavs2_blockcopy_sp_32x64_avx2 FPFX(blockcopy_sp_32x64_avx2) void xavs2_blockcopy_sp_32x64_avx2(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); // 64 x N #define xavs2_blockcopy_sp_64x64_avx2 FPFX(blockcopy_sp_64x64_avx2) void xavs2_blockcopy_sp_64x64_avx2(uint8_t *a, intptr_t stridea, const int16_t* b, intptr_t strideb); // copy_ps primitives // 16 x N #define xavs2_blockcopy_ps_16x16_avx2 FPFX(blockcopy_ps_16x16_avx2) void xavs2_blockcopy_ps_16x16_avx2(int16_t* a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_ps_16x32_avx2 FPFX(blockcopy_ps_16x32_avx2) void xavs2_blockcopy_ps_16x32_avx2(int16_t* a, intptr_t stridea, const uint8_t *b, intptr_t strideb); // 32 x N #define xavs2_blockcopy_ps_32x32_avx2 FPFX(blockcopy_ps_32x32_avx2) void xavs2_blockcopy_ps_32x32_avx2(int16_t* a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #define xavs2_blockcopy_ps_32x64_avx2 FPFX(blockcopy_ps_32x64_avx2) void xavs2_blockcopy_ps_32x64_avx2(int16_t* a, intptr_t stridea, const uint8_t 
*b, intptr_t strideb); // 64 x N #define xavs2_blockcopy_ps_64x64_avx2 FPFX(blockcopy_ps_64x64_avx2) void xavs2_blockcopy_ps_64x64_avx2(int16_t* a, intptr_t stridea, const uint8_t *b, intptr_t strideb); #undef BLOCKCOPY_COMMON #undef BLOCKCOPY_SS_PP #undef BLOCKCOPY_SP #undef BLOCKCOPY_PS #undef SETUP_BLOCKCOPY_PS #undef SETUP_BLOCKCOPY_SP #undef SETUP_BLOCKCOPY_SS_PP #undef SETUP_BLOCKCOPY_FUNC #endif // __xXAVS2_BLOCKCOPY_H xavs2-1.3/source/common/x86/const-a.asm000066400000000000000000000206301340660520300177030ustar00rootroot00000000000000;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Min Chen ;* Praveen Kumar Tiwari ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" SECTION_RODATA 32 ;; 8-bit constants const pb_0, times 32 db 0 const pb_1, times 32 db 1 const pb_2, times 32 db 2 const pb_3, times 32 db 3 const pb_4, times 32 db 4 const pb_8, times 32 db 8 const pb_15, times 32 db 15 const pb_16, times 32 db 16 const pb_31, times 32 db 31 const pb_32, times 32 db 32 const pb_64, times 32 db 64 const pb_124, times 32 db 124 const pb_128, times 32 db 128 const pb_a1, times 16 db 0xa1 const pb_01, times 8 db 0, 1 const pb_0123, times 4 db 0, 1 times 4 db 2, 3 const hsub_mul, times 16 db 1, -1 const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1 const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 const pb_unpackbd2, times 2 db 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7 const pb_unpackwq1, times 1 db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 const pb_unpackwq2, times 1 db 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7 const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6 const pb_movemask, times 16 db 0x00 times 16 db 0xFF const pb_movemask_32, times 32 db 0x00 times 32 db 0xFF times 32 db 0x00 const pb_0000000000000F0F, times 2 db 0xff, 0x00 times 12 db 0x00 const pb_000000000000000F, db 0xff times 15 db 0x00 const pb_shuf_off4, times 2 db 0, 4, 1, 5, 2, 6, 3, 7 const pw_shuf_off4, times 1 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ;; 16-bit constants const pw_n1, times 16 dw -1 const pw_1, times 16 dw 1 const pw_2, times 16 dw 2 const pw_3, times 16 dw 3 const pw_7, times 16 dw 7 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 const pw_16, times 16 dw 16 const pw_15, times 16 dw 15 const pw_31, times 16 dw 31 const pw_32, times 16 dw 32 const pw_64, times 8 dw 64 const pw_128, times 16 dw 128 const pw_256, times 16 dw 256 const pw_257, times 16 dw 257 const pw_512, times 16 dw 512 const pw_1023, times 16 dw 1023 const pw_1024, times 16 dw 1024 const pw_2048, times 16 dw 2048 const pw_4096, times 16 dw 4096 const pw_8192, times 8 dw 8192 const pw_00ff, times 16 dw 0x00ff const pw_ff00, times 8 dw 0xff00 const pw_2000, times 16 dw 0x2000 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 16 dw 0x3fff const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1) const pw_0_7, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7 const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1 const pw_ppmmppmm, times 1 dw 1, 1, -1, -1, 1, 1, -1, -1 const pw_pmpmpmpm, times 16 dw 1, -1, 1, -1, 1, -1, 1, -1 const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0 const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4 const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16 const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32 const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 const pw_FFFFFFFFFFFFFFF0, dw 0x00 times 7 dw 0xff const hmul_16p, times 16 db 1 times 8 db 1, -1 const pw_exp2_0_15, dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 const pw_1_ffff, times 4 dw 1 times 4 dw 0xFFFF ;; 32-bit constants const pd_0, times 8 dd 0 const pd_1, times 8 dd 1 const pd_2, times 8 dd 2 const pd_3, times 8 dd 3 const pd_4, times 4 dd 4 const 
pd_8, times 4 dd 8 const pd_11, times 4 dd 11 const pd_12, times 4 dd 12 const pd_15, times 8 dd 15 const pd_16, times 8 dd 16 const pd_31, times 8 dd 31 const pd_32, times 8 dd 32 const pd_64, times 4 dd 64 const pd_128, times 4 dd 128 const pd_256, times 4 dd 256 const pd_512, times 4 dd 512 const pd_1024, times 4 dd 1024 const pd_2048, times 4 dd 2048 const pd_ffff, times 4 dd 0xffff const pd_32767, times 4 dd 32767 const pd_n32768, times 4 dd 0xffff8000 const pd_524416, times 4 dd 524416 const pd_n32768, times 8 dd 0xffff8000 const pd_n131072, times 4 dd 0xfffe0000 const pd_0000ffff, times 8 dd 0x0000FFFF const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const pd_planar32_mul1, times 1 dd 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 const pd_planar32_mul2, times 1 dd 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 const pd_planar16_mul2, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 ;; 64-bit constants const pq_1, times 1 dq 1 xavs2-1.3/source/common/x86/cpu-a.asm000066400000000000000000000133431340660520300173470ustar00rootroot00000000000000;***************************************************************************** ;* cpu-a.asm: x86 cpu utilities ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Laurent Aimar ;* Loren Merritt ;* Fiona Glaser ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" SECTION .text ;----------------------------------------------------------------------------- ; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- cglobal cpu_cpuid, 5,7 push rbx push r4 push r3 push r2 push r1 mov eax, r0d xor ecx, ecx cpuid pop r4 mov [r4], eax pop r4 mov [r4], ebx pop r4 mov [r4], ecx pop r4 mov [r4], edx pop rbx RET ;----------------------------------------------------------------------------- ; void cpu_xgetbv( int op, int *eax, int *edx ) ;----------------------------------------------------------------------------- cglobal cpu_xgetbv, 3,7 push r2 push r1 mov ecx, r0d xgetbv pop r4 mov [r4], eax pop r4 mov [r4], edx RET ;----------------------------------------------------------------------------- ; void cpuid_get_serial_number( int op, int *eax, int *ebx, int *ecx, int *edx ) ; 2017-06-18 luofl ;----------------------------------------------------------------------------- cglobal cpuid_get_serial_number, 5,7 push rbx push r4 push r3 push r2 push r1 ; first 64 bits mov eax, 00h xor edx, edx cpuid pop r4 mov [r4], edx pop r4 mov [r4], eax ; second 64 bits mov eax, 01h xor ecx, ecx xor edx, edx cpuid pop r4 mov [r4], edx pop r4 mov [r4], eax pop rbx RET %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void stack_align( void (*func)(void*), void *arg ); ;----------------------------------------------------------------------------- cglobal stack_align push rbp mov rbp, rsp %if WIN64 sub rsp, 32 ; shadow space %endif and rsp, ~31 mov rax, r0 mov r0, r1 mov r1, r2 mov r2, r3 call rax leave ret %else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) ; return 0 if unsupported ;----------------------------------------------------------------------------- cglobal cpu_cpuid_test pushfd push ebx push ebp push esi push edi pushfd pop eax mov ebx, eax xor eax, 0x200000 push eax popfd pushfd pop eax xor eax, ebx pop edi pop esi pop ebp pop ebx popfd ret cglobal stack_align push ebp mov ebp, esp sub esp, 12 and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx mov edx, [ebp+16] mov [esp+4], edx mov edx, [ebp+20] mov [esp+8], edx call ecx leave ret %endif ;----------------------------------------------------------------------------- ; void cpu_emms( void ) ;----------------------------------------------------------------------------- cglobal cpu_emms emms ret ;----------------------------------------------------------------------------- ; void cpu_sfence( void ) ;----------------------------------------------------------------------------- cglobal cpu_sfence sfence ret %if 0 ; REMOVED cextern intel_cpu_indicator_init ;----------------------------------------------------------------------------- ; void safe_intel_cpu_indicator_init( void ); ;----------------------------------------------------------------------------- cglobal safe_intel_cpu_indicator_init push r0 push r1 push r2 push r3 push r4 push r5 push r6 %if ARCH_X86_64 push r7 push r8 push r9 push r10 push r11 push r12 push r13 push r14 %endif push rbp mov rbp, rsp %if WIN64 sub rsp, 32 ; shadow space %endif and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64 pop r14 pop r13 pop r12 pop r11 pop r10 pop r9 pop r8 pop r7 %endif pop r6 pop r5 pop r4 pop r3 pop r2 pop r1 pop r0 ret %endif ; if 
0xavs2-1.3/source/common/x86/dct8.asm000066400000000000000000003532431340660520300172120ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Nabajit Deka ;* Min Chen ;* Li Cao ;* Praveen Kumar Tiwari ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ ;TO-DO : Further optimize the routines. %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 ; ---------------------------------------------------------------------------- ; const local table defines ; ---------------------------------------------------------------------------- tab_dct8: dw 32, 32, 32, 32, 32, 32, 32, 32 dw 44, 38, 25, 9, -9, -25, -38, -44 dw 42, 17, -17, -42, -42, -17, 17, 42 dw 38, -9, -44, -25, 25, 44, 9, -38 dw 32, -32, -32, 32, 32, -32, -32, 32 dw 25, -44, 9, 38, -38, -9, 44, -25 dw 17, -42, 42, -17, -17, 42, -42, 17 dw 9, -25, 38, -44, 44, -38, 25, -9 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 tab_dct16_1: dw 32, 32, 32, 32, 32, 32, 32, 32 dw 45, 43, 40, 35, 29, 21, 13, 4 dw 44, 38, 25, 9, -9, -25, -38, -44 dw 43, 29, 4, -21, -40, -45, -35, -13 dw 42, 17, -17, -42, -42, -17, 17, 42 dw 40, 4, -35, -43, -13, 29, 45, 21 dw 38, -9, -44, -25, 25, 44, 9, -38 dw 35, -21, -43, 4, 45, 13, -40, -29 dw 32, -32, -32, 32, 32, -32, -32, 32 dw 29, -40, -13, 45, -4, -43, 21, 35 dw 25, -44, 9, 38, -38, -9, 44, -25 dw 21, -45, 29, 13, -43, 35, 4, -40 dw 17, -42, 42, -17, -17, 42, -42, 17 dw 13, -35, 45, -40, 21, 4, -29, 43 dw 9, -25, 38, -44, 44, -38, 25, -9 dw 4, -13, 21, -29, 35, -40, 43, -45 tab_dct16_2: dw 32, 32, 32, 32, 32, 32, 32, 32 dw -4, -13, -21, -29, -35, -40, -43, -45 dw -44, -38, -25, -9, 9, 25, 38, 44 dw 13, 35, 45, 40, 21, -4, -29, -43 dw 42, 17, -17, -42, -42, -17, 17, 42 dw -21, -45, -29, 13, 43, 35, -4, -40 dw -38, 9, 44, 25, -25, -44, -9, 38 dw 29, 40, -13, -45, -4, 43, 21, -35 dw 32, -32, -32, 32, 32, -32, -32, 32 dw -35, -21, 43, 4, -45, 13, 40, -29 dw -25, 44, -9, -38, 38, 9, -44, 25 dw 40, -4, -35, 43, -13, -29, 45, -21 dw 17, -42, 42, -17, -17, 42, -42, 17 dw -43, 29, -4, -21, 40, -45, 35, -13 dw -9, 25, -38, 44, -44, 38, -25, 9 dw 45, -43, 40, -35, 29, -21, 13, -4 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 tab_dct32_1: dw 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 dw 45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2 dw 45, 43, 40, 35, 29, 21, 13, 4, -4, -13, -21, -29, -35, -40, -43, -45 dw 
45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7 dw 44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44 dw 44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11 dw 43, 29, 4, -21, -40, -45, -35, -13, 13, 35, 45, 40, 21, -4, -29, -43 dw 43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15 dw 42, 17, -17, -42, -42, -17, 17, 42, 42, 17, -17, -42, -42, -17, 17, 42 dw 41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19 dw 40, 4, -35, -43, -13, 29, 45, 21, -21, -45, -29, 13, 43, 35, -4, -40 dw 39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23 dw 38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38 dw 36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27 dw 35, -21, -43, 4, 45, 13, -40, -29, 29, 40, -13, -45, -4, 43, 21, -35 dw 34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30 dw 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32 dw 30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34 dw 29, -40, -13, 45, -4, -43, 21, 35, -35, -21, 43, 4, -45, 13, 40, -29 dw 27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36 dw 25, -44, 9, 38, -38, -9, 44, -25, -25, 44, -9, -38, 38, 9, -44, 25 dw 23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39 dw 21, -45, 29, 13, -43, 35, 4, -40, 40, -4, -35, 43, -13, -29, 45, -21 dw 19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41 dw 17, -42, 42, -17, -17, 42, -42, 17, 17, -42, 42, -17, -17, 42, -42, 17 dw 15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43 dw 13, -35, 45, -40, 21, 4, -29, 43, -43, 29, -4, -21, 40, -45, 35, -13 dw 11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44 dw 9, -25, 38, -44, 44, -38, 25, -9, -9, 25, -38, 44, -44, 38, -25, 9 dw 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45 dw 4, -13, 21, -29, 35, -40, 43, -45, 45, -43, 40, -35, 29, -21, 13, -4 dw 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45 tab_dct32_2: dw 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 dw -2, -7, -11, -15, -19, -23, -27, -30, -34, -36, -39, -41, -43, -44, -45, -45 dw -45, -43, -40, -35, -29, -21, -13, -4, 4, 13, 21, 29, 35, 40, 43, 45 dw 7, 19, 30, 39, 44, 45, 43, 36, 27, 15, 2, -11, -23, -34, -41, -45 dw 44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44 dw -11, -30, -43, -45, -36, -19, 2, 23, 39, 45, 41, 27, 7, -15, -34, -44 dw -43, -29, -4, 21, 40, 45, 35, 13, -13, -35, -45, -40, -21, 4, 29, 43 dw 15, 39, 45, 30, 2, -27, -44, -41, -19, 11, 36, 45, 34, 7, -23, -43 dw 42, 17, -17, -42, -42, -17, 17, 42, 42, 17, -17, -42, -42, -17, 17, 42 dw -19, -44, -36, -2, 34, 45, 23, -15, -43, -39, -7, 30, 45, 27, -11, -41 dw -40, -4, 35, 43, 13, -29, -45, -21, 21, 45, 29, -13, -43, -35, 4, 40 dw 23, 45, 19, -27, -45, -15, 30, 44, 11, -34, -43, -7, 36, 41, 2, -39 dw 38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38 dw -27, -43, 2, 44, 23, -30, -41, 7, 45, 19, -34, -39, 11, 45, 15, -36 dw -35, 21, 43, -4, -45, -13, 40, 29, -29, -40, 13, 45, 4, -43, -21, 35 dw 30, 36, -23, -41, 15, 44, -7, -45, -2, 45, 11, -43, -19, 39, 27, -34 dw 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32 dw -34, -27, 39, 19, -43, -11, 45, 2, -45, 7, 44, -15, -41, 23, 36, -30 dw -29, 40, 13, -45, 4, 43, -21, -35, 35, 21, -43, -4, 45, -13, -40, 29 dw 36, 15, -45, 11, 39, -34, -19, 45, -7, -41, 30, 23, -44, 2, 43, -27 dw 25, -44, 9, 38, 
-38, -9, 44, -25, -25, 44, -9, -38, 38, 9, -44, 25 dw -39, -2, 41, -36, -7, 43, -34, -11, 44, -30, -15, 45, -27, -19, 45, -23 dw -21, 45, -29, -13, 43, -35, -4, 40, -40, 4, 35, -43, 13, 29, -45, 21 dw 41, -11, -27, 45, -30, -7, 39, -43, 15, 23, -45, 34, 2, -36, 44, -19 dw 17, -42, 42, -17, -17, 42, -42, 17, 17, -42, 42, -17, -17, 42, -42, 17 dw -43, 23, 7, -34, 45, -36, 11, 19, -41, 44, -27, -2, 30, -45, 39, -15 dw -13, 35, -45, 40, -21, -4, 29, -43, 43, -29, 4, 21, -40, 45, -35, 13 dw 44, -34, 15, 7, -27, 41, -45, 39, -23, 2, 19, -36, 45, -43, 30, -11 dw 9, -25, 38, -44, 44, -38, 25, -9, -9, 25, -38, 44, -44, 38, -25, 9 dw -45, 41, -34, 23, -11, -2, 15, -27, 36, -43, 45, -44, 39, -30, 19, -7 dw -4, 13, -21, 29, -35, 40, -43, 45, -45, 43, -40, 35, -29, 21, -13, 4 dw 45, -45, 44, -43, 41, -39, 36, -34, 30, -27, 23, -19, 15, -11, 7, -2 avx2_idct8_1: times 4 dw 32, 42, 32, 17 times 4 dw 32, 17, -32, -42 times 4 dw 32, -17, -32, 42 times 4 dw 32, -42, 32, -17 avx2_idct8_2: times 4 dw 44, 38, 25, 9 times 4 dw 38, -9, -44, -25 times 4 dw 25, -44, 9, 38 times 4 dw 9, -25, 38, -44 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7 const idct8_shuf2, times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 tab_idct16_1: dw 45, 43, 40, 35, 29, 21, 13, 4 dw 43, 29, 4, -21, -40, -45, -35, -13 dw 40, 4, -35, -43, -13, 29, 45, 21 dw 35, -21, -43, 4, 45, 13, -40, -29 dw 29, -40, -13, 45, -4, -43, 21, 35 dw 21, -45, 29, 13, -43, 35, 4, -40 dw 13, -35, 45, -40, 21, 4, -29, 43 dw 4, -13, 21, -29, 35, -40, 43, -45 tab_idct16_2: dw 32, 44, 42, 38, 32, 25, 17, 9 dw 32, 38, 17, -9, -32, -44, -42, -25 dw 32, 25, -17, -44, -32, 9, 42, 38 dw 32, 9, -42, -25, 32, 38, -17, -44 dw 32, -9, -42, 25, 32, -38, -17, 44 dw 32, -25, -17, 44, -32, -9, 42, -38 dw 32, -38, 17, 9, -32, 44, -42, 25 dw 32, -44, 42, -38, 32, -25, 17, -9 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 tab_idct32_1: dw 45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2 dw 45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7 dw 44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11 dw 43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15 dw 41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19 dw 39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23 dw 36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27 dw 34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30 dw 30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34 dw 27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36 dw 23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39 dw 19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41 dw 15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43 dw 11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44 dw 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45 dw 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45 tab_idct32_2: dw 32, 44, 42, 38, 32, 25, 17, 9 dw 32, 38, 17, -9, -32, -44, -42, -25 dw 32, 25, -17, -44, -32, 9, 42, 38 dw 32, 9, -42, -25, 32, 38, -17, -44 dw 32, -9, -42, 25, 32, -38, -17, 44 dw 32, -25, -17, 44, -32, -9, 42, -38 dw 32, -38, 17, 9, -32, 44, -42, 25 dw 32, -44, 42, -38, 32, -25, 17, -9 tab_idct32_3: dw 45, 43, 40, 35, 29, 21, 13, 4 dw 43, 29, 4, -21, -40, -45, -35, -13 dw 40, 4, -35, -43, 
-13, 29, 45, 21 dw 35, -21, -43, 4, 45, 13, -40, -29 dw 29, -40, -13, 45, -4, -43, 21, 35 dw 21, -45, 29, 13, -43, 35, 4, -40 dw 13, -35, 45, -40, 21, 4, -29, 43 dw 4, -13, 21, -29, 35, -40, 43, -45 tab_idct32_4: dw 32, 45, 44, 43, 42, 40, 38, 35, 32, 29, 25, 21, 17, 13, 9, 4 dw 32, 43, 38, 29, 17, 4, -9, -21, -32, -40, -44, -45, -42, -35, -25, -13 dw 32, 40, 25, 4, -17, -35, -44, -43, -32, -13, 9, 29, 42, 45, 38, 21 dw 32, 35, 9, -21, -42, -43, -25, 4, 32, 45, 38, 13, -17, -40, -44, -29 dw 32, 29, -9, -40, -42, -13, 25, 45, 32, -4, -38, -43, -17, 21, 44, 35 dw 32, 21, -25, -45, -17, 29, 44, 13, -32, -43, -9, 35, 42, 4, -38, -40 dw 32, 13, -38, -35, 17, 45, 9, -40, -32, 21, 44, 4, -42, -29, 25, 43 dw 32, 4, -44, -13, 42, 21, -38, -29, 32, 35, -25, -40, 17, 43, -9, -45 dw 32, -4, -44, 13, 42, -21, -38, 29, 32, -35, -25, 40, 17, -43, -9, 45 dw 32, -13, -38, 35, 17, -45, 9, 40, -32, -21, 44, -4, -42, 29, 25, -43 dw 32, -21, -25, 45, -17, -29, 44, -13, -32, 43, -9, -35, 42, -4, -38, 40 dw 32, -29, -9, 40, -42, 13, 25, -45, 32, 4, -38, 43, -17, -21, 44, -35 dw 32, -35, 9, 21, -42, 43, -25, -4, 32, -45, 38, -13, -17, 40, -44, 29 dw 32, -40, 25, -4, -17, 35, -44, 43, -32, 13, 9, -29, 42, -45, 38, -21 dw 32, -43, 38, -29, 17, -4, -9, 21, -32, 40, -44, 45, -42, 35, -25, 13 dw 32, -45, 44, -43, 42, -40, 38, -35, 32, -29, 25, -21, 17, -13, 9, -4 avx2_dct4: dw 32, 32, 32, 32, 32, 32, 32, 32, 32, -32, 32, -32, 32, -32, 32, -32 dw 42, 17, 42, 17, 42, 17, 42, 17, 17, -42, 17, -42, 17, -42, 17, -42 avx2_idct4_1: dw 32, 32, 32, 32, 32, 32, 32, 32, 32, -32, 32, -32, 32, -32, 32, -32 dw 42, 17, 42, 17, 42, 17, 42, 17, 17, -42, 17, -42, 17, -42, 17, -42 avx2_idct4_2: dw 32, 32, 32,-32, 42, 17, 17,-42 const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 tab_dct4: times 4 dw 32, 32 times 4 dw 42, 17 times 4 dw 32, -32 times 4 dw 17, -42 dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 tab_dct8_1: times 2 dw 44, 25, 38, 9 times 2 dw 38, -44, -9, -25 times 2 dw 25, 9, -44, 38 times 2 dw 9, 38, -25, -44 tab_dct8_2: times 2 dd 42, 17 times 2 dd 17, 42 times 1 dd 44, 38, 25, 9 times 1 dd 38, -9, -44, -25 times 1 dd 25, -44, 9, 38 times 1 dd 9, -25, 38, -44 tab_idct8_3: times 4 dw 44, 38 times 4 dw 25, 9 times 4 dw 38, -9 times 4 dw -44, -25 times 4 dw 25, -44 times 4 dw 9, 38 times 4 dw 9, -25 times 4 dw 38, -44 pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 tab_idct8_1: times 1 dw 32, -32, 17, -42, 32, 32, 42, 17 tab_idct8_2: times 1 dw 44, 38, 25, 9, 38, -9, -44, -25 times 1 dw 25, -44, 9, 38, 9, -25, 38, -44 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 SECTION .text cextern pd_0 cextern pd_1 cextern pd_2 cextern pd_3 cextern pd_4 cextern pd_8 cextern pd_11 cextern pd_12 cextern pd_16 cextern pd_32 cextern pd_64 cextern pd_128 cextern pd_256 cextern pd_512 cextern pd_1024 cextern pd_2048 cextern pw_ppppmmmm cextern trans8_shuf %if BIT_DEPTH == 10 %define DCT4_SHIFT1 2 %define DCT4_ROUND1 2 %define DCT8_SHIFT1 4 %define DCT8_ROUND1 8 %define DCT16_SHIFT1 5 %define DCT16_ROUND1 16 %define DCT32_SHIFT1 5 %define DCT32_ROUND1 16 %define IDCT4_SHIFT2 10 %define IDCT4_ROUND2 512 %define IDCT8_SHIFT2 10 %define IDCT8_ROUND2 512 %define IDCT16_SHIFT2 10 %define IDCT16_ROUND2 512 %define IDCT32_SHIFT2 10 %define IDCT32_ROUND2 512 %elif BIT_DEPTH == 8 %define DCT4_SHIFT1 0 %define DCT4_ROUND1 0 
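; NOTE: each *_ROUND constant in this block is the standard rounding offset for
; its shift, i.e. ROUND = 1 << (SHIFT - 1) (the 8-bit DCT4 first stage is the
; degenerate SHIFT = 0, ROUND = 0 case), so every transform stage reduces to a
; rounded right shift of the form (sum + ROUND) >> SHIFT.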
%define DCT8_SHIFT1 1 %define DCT8_ROUND1 1 %define DCT16_SHIFT1 2 %define DCT16_ROUND1 2 %define DCT32_SHIFT1 3 %define DCT32_ROUND1 4 %define IDCT4_SHIFT2 12 %define IDCT4_ROUND2 2048 %define IDCT8_SHIFT2 12 %define IDCT8_ROUND2 2048 %define IDCT16_SHIFT2 12 %define IDCT16_ROUND2 2048 %define IDCT32_SHIFT2 12 %define IDCT32_ROUND2 2048 %else %error Unsupported BIT_DEPTH! %endif %define DCT4_SHIFT2 7 %define DCT4_ROUND2 64 %define DCT8_SHIFT2 8 %define DCT8_ROUND2 128 %define DCT16_SHIFT2 9 %define DCT16_ROUND2 256 %define DCT32_SHIFT2 10 %define DCT32_ROUND2 512 %define IDCT4_SHIFT1 5 %define IDCT4_ROUND1 16 %define IDCT8_SHIFT1 5 %define IDCT8_ROUND1 16 %define IDCT16_SHIFT1 5 %define IDCT16_ROUND1 16 %define IDCT32_SHIFT1 5 %define IDCT32_ROUND1 16 ; ============================================================================ ; void dct_4x4(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_4x4_sse2 INIT_XMM sse2 cglobal dct_4x4, 3, 4, 8 mova m7, [pd_ %+ DCT4_ROUND1] add r2d, r2d lea r3, [tab_dct4] mova m4, [r3 + 0 * 16] mova m5, [r3 + 1 * 16] mova m6, [r3 + 2 * 16] movh m0, [r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 pshufd m0, m0, 0xD8 pshufhw m0, m0, 0xB1 lea r0, [r0 + 2 * r2] movh m1, [r0 ] movh m2, [r0 + r2] punpcklqdq m1, m2 pshufd m1, m1, 0xD8 pshufhw m1, m1, 0xB1 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 paddw m1, m2, m0 psubw m2, m0 pmaddwd m0, m1, m4 paddd m0, m7 psrad m0, DCT4_SHIFT1 pmaddwd m3, m2, m5 paddd m3, m7 psrad m3, DCT4_SHIFT1 packssdw m0, m3 pshufd m0, m0, 0xD8 pshufhw m0, m0, 0xB1 pmaddwd m1, m6 paddd m1, m7 psrad m1, DCT4_SHIFT1 pmaddwd m2, [r3 + 3 * 16] paddd m2, m7 psrad m2, DCT4_SHIFT1 packssdw m1, m2 pshufd m1, m1, 0xD8 pshufhw m1, m1, 0xB1 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 mova m7, [pd_ %+ DCT4_ROUND2] pmaddwd m1, m2, m4 pmaddwd m3, m0, m4 paddd m1, m3 paddd m1, m7 psrad m1, DCT4_SHIFT2 pmaddwd m4, m2, m5 pmaddwd m3, m0, m5 psubd m4, m3 paddd m4, m7 psrad m4, DCT4_SHIFT2 packssdw m1, m4 movu [r1 + 0 * 16], m1 pmaddwd m1, m2, m6 pmaddwd m3, m0, m6 paddd m1, m3 paddd m1, m7 psrad m1, DCT4_SHIFT2 pmaddwd m2, [r3 + 3 * 16] pmaddwd m0, [r3 + 3 * 16] psubd m2, m0 paddd m2, m7 psrad m2, DCT4_SHIFT2 packssdw m1, m2 movu [r1 + 1 * 16], m1 RET ; ---------------------------------------------------------------------------- ; dct_4x4_avx2 INIT_YMM avx2 cglobal dct_4x4, 3, 4, 8, src, dst, i_src vbroadcasti128 m7, [pd_ %+ DCT4_ROUND1] add r2d, r2d lea r3, [avx2_dct4] vbroadcasti128 m4, [dct4_shuf] mova m5, [r3 ] mova m6, [r3 + 32] movq xm0, [r0 ] movhps xm0, [r0 + r2] lea r0, [r0 + 2 * r2] movq xm1, [r0 ] movhps xm1, [r0 + r2] vinserti128 m0, m0, xm1, 1 pshufb m0, m4 vpermq m1, m0, 11011101b vpermq m0, m0, 10001000b paddw m2, m0, m1 psubw m0, m1 pmaddwd m2, m5 paddd m2, m7 psrad m2, DCT4_SHIFT1 pmaddwd m0, m6 paddd m0, m7 psrad m0, DCT4_SHIFT1 packssdw m2, m0 pshufb m2, m4 vpermq m1, m2, 11011101b vpermq m2, m2, 10001000b vbroadcasti128 m7, [pd_ %+ DCT4_ROUND2] pmaddwd m0, m2, m5 pmaddwd m3, m1, m5 paddd m3, m0 paddd m3, m7 psrad m3, DCT4_SHIFT2 pmaddwd m2, m6 pmaddwd m1, m6 psubd m2, m1 paddd m2, m7 psrad m2, DCT4_SHIFT2 packssdw m3, m2 movu [r1], m3 RET ; ============================================================================ ; void idct_4x4(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; 
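; The idct_*() routines below read a contiguous block of 16-bit coefficients
; from src and write 16-bit results to dst with a stride of i_dst coefficients
; (the code doubles r2 to turn it into a byte stride). As a minimal sketch of
; the shared rounding convention (an illustrative helper only; it is not
; referenced anywhere else in this file):
%define ROUND_SHIFT(x, s) (((x) + (1 << ((s) - 1))) >> (s))
; the first pass applies this with IDCT*_SHIFT1 and the second pass with
; IDCT*_SHIFT2.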
---------------------------------------------------------------------------- ; idct_4x4_sse2 INIT_XMM sse2 cglobal idct_4x4, 3, 4, 6 add r2d, r2d lea r3, [tab_dct4] movu m0, [r0 + 0 * 16] movu m1, [r0 + 1 * 16] punpcklwd m2, m0, m1 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 paddd m3, [pd_ %+ IDCT4_ROUND1] pmaddwd m2, [r3 + 2 * 16] ; m2 = E2 paddd m2, [pd_ %+ IDCT4_ROUND1] punpckhwd m0, m1 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 paddd m4, m3, m1 psrad m4, IDCT4_SHIFT1 ; m4 = m128iA paddd m5, m2, m0 psrad m5, IDCT4_SHIFT1 packssdw m4, m5 ; m4 = m128iA psubd m2, m0 psrad m2, IDCT4_SHIFT1 psubd m3, m1 psrad m3, IDCT4_SHIFT1 packssdw m2, m3 ; m2 = m128iD punpcklwd m1, m4, m2 ; m1 = S0 punpckhwd m4, m2 ; m4 = S8 punpcklwd m0, m1, m4 ; m0 = m128iA punpckhwd m1, m4 ; m1 = m128iD punpcklwd m2, m0, m1 pmaddwd m3, m2, [r3 + 0 * 16] paddd m3, [pd_ %+ IDCT4_ROUND2] ; m3 = E1 pmaddwd m2, [r3 + 2 * 16] paddd m2, [pd_ %+ IDCT4_ROUND2] ; m2 = E2 punpckhwd m0, m1 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 paddd m4, m3, m1 psrad m4, IDCT4_SHIFT2 ; m4 = m128iA paddd m5, m2, m0 psrad m5, IDCT4_SHIFT2 packssdw m4, m5 ; m4 = m128iA psubd m2, m0 psrad m2, IDCT4_SHIFT2 psubd m3, m1 psrad m3, IDCT4_SHIFT2 packssdw m2, m3 ; m2 = m128iD punpcklwd m1, m4, m2 punpckhwd m4, m2 punpcklwd m0, m1, m4 movlps [r1 + 0 * r2], m0 movhps [r1 + 1 * r2], m0 punpckhwd m1, m4 movlps [r1 + 2 * r2], m1 lea r1, [r1 + 2 * r2] movhps [r1 + r2], m1 RET ; ============================================================================ ; void dct_8x8(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_8x8_sse2 INIT_XMM sse2 cglobal dct_8x8, 3,6,8,0-16*mmsize ;------------------------ ; Stack Mapping(dword) ;------------------------ ; Row0[0-3] Row1[0-3] ; ... ; Row6[0-3] Row7[0-3] ; Row0[0-3] Row7[0-3] ; ... 
; Row6[4-7] Row7[4-7] ;------------------------ add r2, r2 lea r3, [r2 * 3] mov r5, rsp %assign x 0 %rep 2 movu m0, [r0 ] movu m1, [r0 + r2] movu m2, [r0 + r2 * 2] movu m3, [r0 + r3] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckldq m1, m4, m5 ; m1 = [1 0] punpckhdq m4, m5 ; m4 = [3 2] punpckldq m3, m0, m2 punpckhdq m0, m2 pshufd m2, m3, 0x4E ; m2 = [4 5] pshufd m0, m0, 0x4E ; m0 = [6 7] paddw m3, m1, m0 psubw m1, m0 ; m1 = [d1 d0] paddw m0, m4, m2 psubw m4, m2 ; m4 = [d3 d2] punpcklqdq m2, m3, m0 ; m2 = [s2 s0] punpckhqdq m3, m0 pshufd m3, m3, 0x4E ; m3 = [s1 s3] punpcklwd m0, m1, m4 ; m0 = [d2/d0] punpckhwd m1, m4 ; m1 = [d3/d1] punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] ; odd lea r4, [tab_dct8_1] pmaddwd m1, m4, [r4 + 0*16] pmaddwd m5, m0, [r4 + 0*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 1*2*mmsize], m1 ; Row 1 pmaddwd m1, m4, [r4 + 1*16] pmaddwd m5, m0, [r4 + 1*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 3*2*mmsize], m1 ; Row 3 pmaddwd m1, m4, [r4 + 2*16] pmaddwd m5, m0, [r4 + 2*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 5*2*mmsize], m1 ; Row 5 pmaddwd m4, [r4 + 3*16] pmaddwd m0, [r4 + 3*16] pshufd m4, m4, 0xD8 pshufd m0, m0, 0xD8 mova m7, m4 punpckhqdq m7, m0 punpcklqdq m4, m0 paddd m4, m7 paddd m4, [pd_ %+ DCT8_ROUND1] psrad m4, DCT8_SHIFT1 %if x == 1 pshufd m4, m4, 0x1B %endif mova [r5 + 7*2*mmsize], m4 ; Row 7 ; even lea r4, [tab_dct4] paddw m0, m2, m3 ; m0 = [EE1 EE0] pshufd m0, m0, 0xD8 pshuflw m0, m0, 0xD8 pshufhw m0, m0, 0xD8 psubw m2, m3 ; m2 = [EO1 EO0] pmullw m2, [pw_ppppmmmm] pshufd m2, m2, 0xD8 pshuflw m2, m2, 0xD8 pshufhw m2, m2, 0xD8 pmaddwd m3, m0, [r4 + 0*16] paddd m3, [pd_ %+ DCT8_ROUND1] psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 0*2*mmsize], m3 ; Row 0 pmaddwd m0, [r4 + 2*16] paddd m0, [pd_ %+ DCT8_ROUND1] psrad m0, DCT8_SHIFT1 %if x == 1 pshufd m0, m0, 0x1B %endif mova [r5 + 4*2*mmsize], m0 ; Row 4 pmaddwd m3, m2, [r4 + 1*16] paddd m3, [pd_ %+ DCT8_ROUND1] psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 2*2*mmsize], m3 ; Row 2 pmaddwd m2, [r4 + 3*16] paddd m2, [pd_ %+ DCT8_ROUND1] psrad m2, DCT8_SHIFT1 %if x == 1 pshufd m2, m2, 0x1B %endif mova [r5 + 6*2*mmsize], m2 ; Row 6 %if x != 1 lea r0, [r0 + r2 * 4] add r5, mmsize %endif %assign x x+1 %endrep mov r0, rsp ; r0 = pointer to Low Part lea r4, [tab_dct8_2] %assign x 0 %rep 4 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] mova m1, [r0 + 1*2*mmsize] paddd m2, m0, [r0 + (0*2+1)*mmsize] pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] paddd m3, m1, [r0 + (1*2+1)*mmsize] pshufd m3, m3, 0x9C ; m3 = ^^ psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ ; even pshufd m4, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m4 punpckhqdq m7, m3 punpcklqdq m4, m3 mova m2, m4 paddd m4, m7 ; m4 = [EE1 EE0 EE1 EE0] psubd m2, m7 ; m2 = [EO1 EO0 EO1 EO0] pslld m4, 5 ; m4 = [32*EE1 32*EE0] mova m5, m2 pmuludq m5, [r4 + 0*16] pshufd m7, m2, 0xF5 movu m6, [r4 + 0*16 + 4] pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd 
m7, m7, 0x88 punpckldq m5, m7 ; m5 = [17*EO1 42*EO0] pshufd m7, m2, 0xF5 pmuludq m2, [r4 + 1*16] movu m6, [r4 + 1*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 ; m2 = [42*EO1 17*EO0] pshufd m3, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m3 punpckhqdq m7, m5 punpcklqdq m3, m5 paddd m3, m7 ; m3 = [Row2 Row0] paddd m3, [pd_ %+ DCT8_ROUND2] psrad m3, DCT8_SHIFT2 pshufd m4, m4, 0xD8 pshufd m2, m2, 0xD8 mova m7, m4 punpckhqdq m7, m2 punpcklqdq m4, m2 psubd m4, m7 ; m4 = [Row6 Row4] paddd m4, [pd_ %+ DCT8_ROUND2] psrad m4, DCT8_SHIFT2 packssdw m3, m3 movd [r1 + 0*mmsize], m3 pshufd m3, m3, 1 movd [r1 + 2*mmsize], m3 packssdw m4, m4 movd [r1 + 4*mmsize], m4 pshufd m4, m4, 1 movd [r1 + 6*mmsize], m4 ; odd mova m2, m0 pmuludq m2, [r4 + 2*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 2*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 mova m3, m1 pmuludq m3, [r4 + 2*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m3, m3, 0x88 pshufd m7, m7, 0x88 punpckldq m3, m7 mova m4, m0 pmuludq m4, [r4 + 3*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 3*16 + 4] pmuludq m7, m6 pshufd m4, m4, 0x88 pshufd m7, m7, 0x88 punpckldq m4, m7 mova m5, m1 pmuludq m5, [r4 + 3*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd m7, m7, 0x88 punpckldq m5, m7 pshufd m2, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m2 punpckhqdq m7, m3 punpcklqdq m2, m3 paddd m2, m7 pshufd m4, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m4 punpckhqdq m7, m5 punpcklqdq m4, m5 paddd m4, m7 pshufd m2, m2, 0xD8 pshufd m4, m4, 0xD8 mova m7, m2 punpckhqdq m7, m4 punpcklqdq m2, m4 paddd m2, m7 ; m2 = [Row3 Row1] paddd m2, [pd_ %+ DCT8_ROUND2] psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 1*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 3*mmsize], m2 mova m2, m0 pmuludq m2, [r4 + 4*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 4*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 mova m3, m1 pmuludq m3, [r4 + 4*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m3, m3, 0x88 pshufd m7, m7, 0x88 punpckldq m3, m7 mova m4, m0 pmuludq m4, [r4 + 5*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 5*16 + 4] pmuludq m7, m6 pshufd m4, m4, 0x88 pshufd m7, m7, 0x88 punpckldq m4, m7 mova m5, m1 pmuludq m5, [r4 + 5*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd m7, m7, 0x88 punpckldq m5, m7 pshufd m2, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m2 punpckhqdq m7, m3 punpcklqdq m2, m3 paddd m2, m7 pshufd m4, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m4 punpckhqdq m7, m5 punpcklqdq m4, m5 paddd m4, m7 pshufd m2, m2, 0xD8 pshufd m4, m4, 0xD8 mova m7, m2 punpckhqdq m7, m4 punpcklqdq m2, m4 paddd m2, m7 ; m2 = [Row7 Row5] paddd m2, [pd_ %+ DCT8_ROUND2] psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 5*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 7*mmsize], m2 %if x < 3 add r1, mmsize/4 add r0, 2*2*mmsize %endif %assign x x+1 %endrep RET ; ============================================================================ ; void dct_8x8(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_8x8_sse4 INIT_XMM sse4 cglobal dct_8x8, 3,6,7,0-16*mmsize ;------------------------ ; Stack Mapping(dword) ;------------------------ ; Row0[0-3] Row1[0-3] ; ... ; Row6[0-3] Row7[0-3] ; Row0[0-3] Row7[0-3] ; ... 
; Row6[4-7] Row7[4-7] ;------------------------ mova m6, [pd_ %+ DCT8_ROUND1] add r2, r2 lea r3, [r2 * 3] mov r5, rsp %assign x 0 %rep 2 movu m0, [r0] movu m1, [r0 + r2] movu m2, [r0 + r2 * 2] movu m3, [r0 + r3] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckldq m1, m4, m5 ; m1 = [1 0] punpckhdq m4, m5 ; m4 = [3 2] punpckldq m3, m0, m2 punpckhdq m0, m2 pshufd m2, m3, 0x4E ; m2 = [4 5] pshufd m0, m0, 0x4E ; m0 = [6 7] paddw m3, m1, m0 psubw m1, m0 ; m1 = [d1 d0] paddw m0, m4, m2 psubw m4, m2 ; m4 = [d3 d2] punpcklqdq m2, m3, m0 ; m2 = [s2 s0] punpckhqdq m3, m0 pshufd m3, m3, 0x4E ; m3 = [s1 s3] punpcklwd m0, m1, m4 ; m0 = [d2/d0] punpckhwd m1, m4 ; m1 = [d3/d1] punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] ; odd lea r4, [tab_dct8_1] pmaddwd m1, m4, [r4 + 0*16] pmaddwd m5, m0, [r4 + 0*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 1*2*mmsize], m1 ; Row 1 pmaddwd m1, m4, [r4 + 1*16] pmaddwd m5, m0, [r4 + 1*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 3*2*mmsize], m1 ; Row 3 pmaddwd m1, m4, [r4 + 2*16] pmaddwd m5, m0, [r4 + 2*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 5*2*mmsize], m1 ; Row 5 pmaddwd m4, [r4 + 3*16] pmaddwd m0, [r4 + 3*16] phaddd m4, m0 paddd m4, m6 psrad m4, DCT8_SHIFT1 %if x == 1 pshufd m4, m4, 0x1B %endif mova [r5 + 7*2*mmsize], m4 ; Row 7 ; even lea r4, [tab_dct4] paddw m0, m2, m3 ; m0 = [EE1 EE0] pshufb m0, [pb_unpackhlw1] psubw m2, m3 ; m2 = [EO1 EO0] psignw m2, [pw_ppppmmmm] pshufb m2, [pb_unpackhlw1] pmaddwd m3, m0, [r4 + 0*16] paddd m3, m6 psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 0*2*mmsize], m3 ; Row 0 pmaddwd m0, [r4 + 2*16] paddd m0, m6 psrad m0, DCT8_SHIFT1 %if x == 1 pshufd m0, m0, 0x1B %endif mova [r5 + 4*2*mmsize], m0 ; Row 4 pmaddwd m3, m2, [r4 + 1*16] paddd m3, m6 psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 2*2*mmsize], m3 ; Row 2 pmaddwd m2, [r4 + 3*16] paddd m2, m6 psrad m2, DCT8_SHIFT1 %if x == 1 pshufd m2, m2, 0x1B %endif mova [r5 + 6*2*mmsize], m2 ; Row 6 %if x != 1 lea r0, [r0 + r2 * 4] add r5, mmsize %endif %assign x x+1 %endrep mov r2, 2 mov r0, rsp ; r0 = pointer to Low Part lea r4, [tab_dct8_2] mova m6, [pd_ %+ DCT8_ROUND2] .pass2: %rep 2 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] mova m1, [r0 + 1*2*mmsize] paddd m2, m0, [r0 + (0*2+1)*mmsize] pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] paddd m3, m1, [r0 + (1*2+1)*mmsize] pshufd m3, m3, 0x9C ; m3 = ^^ psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ ; even phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0] phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0] pslld m4, 5 ; m4 = [32*EE1 32*EE0] pmulld m5, m2, [r4 + 0*16] ; m5 = [17*EO1 42*EO0] pmulld m2, [r4 + 1*16] ; m2 = [42*EO1 17*EO0] phaddd m3, m4, m5 ; m3 = [Row2 Row0] paddd m3, m6 psrad m3, DCT8_SHIFT2 phsubd m4, m2 ; m4 = [Row6 Row4] paddd m4, m6 psrad m4, DCT8_SHIFT2 packssdw m3, m3 movd [r1 + 0*mmsize], m3 pshufd m3, m3, 1 movd [r1 + 2*mmsize], m3 packssdw m4, m4 movd [r1 + 4*mmsize], m4 pshufd m4, m4, 1 movd [r1 + 6*mmsize], m4 ; odd pmulld m2, m0, [r4 + 2*16] pmulld m3, m1, [r4 + 2*16] pmulld m4, m0, [r4 + 3*16] pmulld m5, m1, [r4 + 3*16] phaddd m2, m3 phaddd m4, m5 phaddd m2, m4 ; m2 = [Row3 Row1] paddd m2, m6 psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 1*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 3*mmsize], m2 pmulld m2, m0, [r4 
+ 4*16] pmulld m3, m1, [r4 + 4*16] pmulld m4, m0, [r4 + 5*16] pmulld m5, m1, [r4 + 5*16] phaddd m2, m3 phaddd m4, m5 phaddd m2, m4 ; m2 = [Row7 Row5] paddd m2, m6 psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 5*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 7*mmsize], m2 add r1, mmsize/4 add r0, 2*2*mmsize %endrep dec r2 jnz .pass2 RET ; ============================================================================ ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ---------------------------------------------------------------------------- %if ARCH_X86_64 INIT_XMM sse2 cglobal idct_8x8, 3, 6, 16, 0-5*mmsize mova m9, [r0 + 1 * mmsize] mova m1, [r0 + 3 * mmsize] mova m7, m9 punpcklwd m7, m1 punpckhwd m9, m1 mova m14, [tab_idct8_3] mova m3, m14 pmaddwd m14, m7 pmaddwd m3, m9 mova m0, [r0 + 5 * mmsize] mova m10, [r0 + 7 * mmsize] mova m2, m0 punpcklwd m2, m10 punpckhwd m0, m10 mova m15, [tab_idct8_3 + 1 * mmsize] mova m11, [tab_idct8_3 + 1 * mmsize] pmaddwd m15, m2 mova m4, [tab_idct8_3 + 2 * mmsize] pmaddwd m11, m0 mova m1, [tab_idct8_3 + 2 * mmsize] paddd m15, m14 mova m5, [tab_idct8_3 + 4 * mmsize] mova m12, [tab_idct8_3 + 4 * mmsize] paddd m11, m3 mova [rsp + 0 * mmsize], m11 mova [rsp + 1 * mmsize], m15 pmaddwd m4, m7 pmaddwd m1, m9 mova m14, [tab_idct8_3 + 3 * mmsize] mova m3, [tab_idct8_3 + 3 * mmsize] pmaddwd m14, m2 pmaddwd m3, m0 paddd m14, m4 paddd m3, m1 mova [rsp + 2 * mmsize], m3 pmaddwd m5, m9 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] mova m6, [tab_idct8_3 + 5 * mmsize] pmaddwd m12, m7 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] mova m4, [tab_idct8_3 + 5 * mmsize] pmaddwd m6, m2 paddd m6, m12 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] paddd m7, m2 mova [rsp + 3 * mmsize], m6 pmaddwd m4, m0 pmaddwd m0, [tab_idct8_3 + 7 * mmsize] paddd m9, m0 paddd m5, m4 mova m6, [r0 + 0 * mmsize] mova m0, [r0 + 4 * mmsize] mova m4, m6 punpcklwd m4, m0 punpckhwd m6, m0 mova m12, [r0 + 2 * mmsize] mova m0, [r0 + 6 * mmsize] mova m13, m12 mova m8, [tab_dct4] punpcklwd m13, m0 mova m10, [tab_dct4] punpckhwd m12, m0 pmaddwd m8, m4 mova m3, m8 pmaddwd m4, [tab_dct4 + 2 * mmsize] pmaddwd m10, m6 mova m2, [tab_dct4 + 1 * mmsize] mova m1, m10 pmaddwd m6, [tab_dct4 + 2 * mmsize] mova m0, [tab_dct4 + 1 * mmsize] pmaddwd m2, m13 paddd m3, m2 psubd m8, m2 mova m2, m6 pmaddwd m13, [tab_dct4 + 3 * mmsize] pmaddwd m0, m12 paddd m1, m0 psubd m10, m0 mova m0, m4 pmaddwd m12, [tab_dct4 + 3 * mmsize] paddd m3, [pd_ %+ IDCT8_ROUND1] paddd m1, [pd_ %+ IDCT8_ROUND1] paddd m8, [pd_ %+ IDCT8_ROUND1] paddd m10, [pd_ %+ IDCT8_ROUND1] paddd m0, m13 paddd m2, m12 paddd m0, [pd_ %+ IDCT8_ROUND1] paddd m2, [pd_ %+ IDCT8_ROUND1] psubd m4, m13 psubd m6, m12 paddd m4, [pd_ %+ IDCT8_ROUND1] paddd m6, [pd_ %+ IDCT8_ROUND1] mova m12, m8 psubd m8, m7 psrad m8, IDCT8_SHIFT1 paddd m15, m3 psubd m3, [rsp + 1 * mmsize] psrad m15, IDCT8_SHIFT1 paddd m12, m7 psrad m12, IDCT8_SHIFT1 paddd m11, m1 mova m13, m14 psrad m11, IDCT8_SHIFT1 packssdw m15, m11 psubd m1, [rsp + 0 * mmsize] psrad m1, IDCT8_SHIFT1 mova m11, [rsp + 2 * mmsize] paddd m14, m0 psrad m14, IDCT8_SHIFT1 psubd m0, m13 psrad m0, IDCT8_SHIFT1 paddd m11, m2 mova m13, [rsp + 3 * mmsize] psrad m11, IDCT8_SHIFT1 packssdw m14, m11 mova m11, m6 psubd m6, m5 paddd m13, m4 psrad m13, IDCT8_SHIFT1 psrad m6, IDCT8_SHIFT1 paddd m11, m5 psrad m11, IDCT8_SHIFT1 packssdw m13, m11 mova m11, m10 psubd m4, [rsp + 3 * mmsize] psubd m10, m9 psrad m4, IDCT8_SHIFT1 psrad m10, IDCT8_SHIFT1 packssdw m4, m6 packssdw m8, m10 
paddd m11, m9 psrad m11, IDCT8_SHIFT1 packssdw m12, m11 psubd m2, [rsp + 2 * mmsize] mova m5, m15 psrad m2, IDCT8_SHIFT1 packssdw m0, m2 mova m2, m14 psrad m3, IDCT8_SHIFT1 packssdw m3, m1 mova m6, m13 punpcklwd m5, m8 punpcklwd m2, m4 mova m1, m12 punpcklwd m6, m0 punpcklwd m1, m3 mova m9, m5 punpckhwd m13, m0 mova m0, m2 punpcklwd m9, m6 punpckhwd m5, m6 punpcklwd m0, m1 punpckhwd m2, m1 punpckhwd m15, m8 mova m1, m5 punpckhwd m14, m4 punpckhwd m12, m3 mova m6, m9 punpckhwd m9, m0 punpcklwd m1, m2 mova m4, [tab_idct8_3 + 0 * mmsize] punpckhwd m5, m2 punpcklwd m6, m0 mova m2, m15 mova m0, m14 mova m7, m9 punpcklwd m2, m13 punpcklwd m0, m12 punpcklwd m7, m5 punpckhwd m14, m12 mova m10, m2 punpckhwd m15, m13 punpckhwd m9, m5 pmaddwd m4, m7 mova m13, m1 punpckhwd m2, m0 punpcklwd m10, m0 mova m0, m15 punpckhwd m15, m14 mova m12, m1 mova m3, [tab_idct8_3 + 0 * mmsize] punpcklwd m0, m14 pmaddwd m3, m9 mova m11, m2 punpckhwd m2, m15 punpcklwd m11, m15 mova m8, [tab_idct8_3 + 1 * mmsize] punpcklwd m13, m0 punpckhwd m12, m0 pmaddwd m8, m11 paddd m8, m4 mova [rsp + 4 * mmsize], m8 mova m4, [tab_idct8_3 + 2 * mmsize] pmaddwd m4, m7 mova m15, [tab_idct8_3 + 2 * mmsize] mova m5, [tab_idct8_3 + 1 * mmsize] pmaddwd m15, m9 pmaddwd m5, m2 paddd m5, m3 mova [rsp + 3 * mmsize], m5 mova m14, [tab_idct8_3 + 3 * mmsize] mova m5, [tab_idct8_3 + 3 * mmsize] pmaddwd m14, m11 paddd m14, m4 mova [rsp + 2 * mmsize], m14 pmaddwd m5, m2 paddd m5, m15 mova [rsp + 1 * mmsize], m5 mova m15, [tab_idct8_3 + 4 * mmsize] mova m5, [tab_idct8_3 + 4 * mmsize] pmaddwd m15, m7 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] pmaddwd m5, m9 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] mova m4, [tab_idct8_3 + 5 * mmsize] pmaddwd m4, m2 paddd m5, m4 mova m4, m6 mova m8, [tab_idct8_3 + 5 * mmsize] punpckhwd m6, m10 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] punpcklwd m4, m10 paddd m9, m2 pmaddwd m8, m11 mova m10, [tab_dct4] paddd m8, m15 pmaddwd m11, [tab_idct8_3 + 7 * mmsize] paddd m7, m11 mova [rsp + 0 * mmsize], m8 pmaddwd m10, m6 pmaddwd m6, [tab_dct4 + 2 * mmsize] mova m1, m10 mova m8, [tab_dct4] mova m3, [tab_dct4 + 1 * mmsize] pmaddwd m8, m4 pmaddwd m4, [tab_dct4 + 2 * mmsize] mova m0, m8 mova m2, [tab_dct4 + 1 * mmsize] pmaddwd m3, m13 psubd m8, m3 paddd m0, m3 mova m3, m6 pmaddwd m13, [tab_dct4 + 3 * mmsize] pmaddwd m2, m12 paddd m1, m2 psubd m10, m2 mova m2, m4 pmaddwd m12, [tab_dct4 + 3 * mmsize] mova m15, [pd_ %+ IDCT8_ROUND2] paddd m0, m15 paddd m1, m15 paddd m8, m15 paddd m10, m15 paddd m2, m13 paddd m3, m12 paddd m2, m15 paddd m3, m15 psubd m4, m13 psubd m6, m12 paddd m4, m15 paddd m6, m15 mova m15, [rsp + 4 * mmsize] mova m12, m8 psubd m8, m7 psrad m8, IDCT8_SHIFT2 mova m11, [rsp + 3 * mmsize] paddd m15, m0 psrad m15, IDCT8_SHIFT2 psubd m0, [rsp + 4 * mmsize] psrad m0, IDCT8_SHIFT2 paddd m12, m7 paddd m11, m1 mova m14, [rsp + 2 * mmsize] psrad m11, IDCT8_SHIFT2 packssdw m15, m11 psubd m1, [rsp + 3 * mmsize] psrad m1, IDCT8_SHIFT2 mova m11, [rsp + 1 * mmsize] paddd m14, m2 psrad m14, IDCT8_SHIFT2 packssdw m0, m1 psrad m12, IDCT8_SHIFT2 psubd m2, [rsp + 2 * mmsize] paddd m11, m3 mova m13, [rsp + 0 * mmsize] psrad m11, IDCT8_SHIFT2 packssdw m14, m11 mova m11, m6 psubd m6, m5 paddd m13, m4 psrad m13, IDCT8_SHIFT2 mova m1, m15 paddd m11, m5 psrad m11, IDCT8_SHIFT2 packssdw m13, m11 mova m11, m10 psubd m10, m9 psrad m10, IDCT8_SHIFT2 packssdw m8, m10 psrad m6, IDCT8_SHIFT2 psubd m4, [rsp + 0 * mmsize] paddd m11, m9 psrad m11, IDCT8_SHIFT2 packssdw m12, m11 punpcklwd m1, m14 mova m5, m13 psrad m4, IDCT8_SHIFT2 packssdw m4, m6 psubd m3, [rsp + 
1 * mmsize] psrad m2, IDCT8_SHIFT2 mova m6, m8 psrad m3, IDCT8_SHIFT2 punpcklwd m5, m12 packssdw m2, m3 punpcklwd m6, m4 punpckhwd m8, m4 mova m4, m1 mova m3, m2 punpckhdq m1, m5 punpckldq m4, m5 punpcklwd m3, m0 punpckhwd m2, m0 mova m0, m6 lea r2, [r2 + r2] lea r4, [r2 + r2] lea r3, [r4 + r2] lea r4, [r4 + r3] lea r0, [r4 + r2 * 2] movq [r1], m4 punpckhwd m15, m14 movhps [r1 + r2], m4 punpckhdq m0, m3 movq [r1 + r2 * 2], m1 punpckhwd m13, m12 movhps [r1 + r3], m1 mova m1, m6 punpckldq m1, m3 movq [r1 + 8], m1 movhps [r1 + r2 + 8], m1 movq [r1 + r2 * 2 + 8], m0 movhps [r1 + r3 + 8], m0 mova m0, m15 punpckhdq m15, m13 punpckldq m0, m13 movq [r1 + r2 * 4], m0 movhps [r1 + r4], m0 mova m0, m8 punpckhdq m8, m2 movq [r1 + r3 * 2], m15 punpckldq m0, m2 movhps [r1 + r0], m15 movq [r1 + r2 * 4 + 8], m0 movhps [r1 + r4 + 8], m0 movq [r1 + r3 * 2 + 8], m8 movhps [r1 + r0 + 8], m8 RET %endif ; ============================================================================ ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ---------------------------------------------------------------------------- INIT_XMM ssse3 cglobal patial_butterfly_inverse_internal_pass1 movh m0, [r0] movhps m0, [r0 + 2 * 16] movh m1, [r0 + 4 * 16] movhps m1, [r0 + 6 * 16] punpckhwd m2, m0, m1 ; [2 6] punpcklwd m0, m1 ; [0 4] pmaddwd m1, m0, [r6] ; EE[0] pmaddwd m0, [r6 + 32] ; EE[1] pmaddwd m3, m2, [r6 + 16] ; EO[0] pmaddwd m2, [r6 + 48] ; EO[1] paddd m4, m1, m3 ; E[0] psubd m1, m3 ; E[3] paddd m3, m0, m2 ; E[1] psubd m0, m2 ; E[2] ;E[K] = E[k] + add mova m5, [pd_ %+ IDCT8_ROUND1] paddd m0, m5 paddd m1, m5 paddd m3, m5 paddd m4, m5 movh m2, [r0 + 16] movhps m2, [r0 + 5 * 16] movh m5, [r0 + 3 * 16] movhps m5, [r0 + 7 * 16] punpcklwd m6, m2, m5 ; [1 3] punpckhwd m2, m5 ; [5 7] pmaddwd m5, m6, [r4 ] pmaddwd m7, m2, [r4 + 16] paddd m5, m7 ; O[0] paddd m7, m4, m5 psrad m7, IDCT8_SHIFT1 psubd m4, m5 psrad m4, IDCT8_SHIFT1 packssdw m7, m4 movh [r5 + 0 * 16], m7 movhps [r5 + 7 * 16], m7 pmaddwd m5, m6, [r4 + 32] pmaddwd m4, m2, [r4 + 48] paddd m5, m4 ; O[1] paddd m4, m3, m5 psrad m4, IDCT8_SHIFT1 psubd m3, m5 psrad m3, IDCT8_SHIFT1 packssdw m4, m3 movh [r5 + 1 * 16], m4 movhps [r5 + 6 * 16], m4 pmaddwd m5, m6, [r4 + 64] pmaddwd m4, m2, [r4 + 80] paddd m5, m4 ; O[2] paddd m4, m0, m5 psrad m4, IDCT8_SHIFT1 psubd m0, m5 psrad m0, IDCT8_SHIFT1 packssdw m4, m0 movh [r5 + 2 * 16], m4 movhps [r5 + 5 * 16], m4 pmaddwd m5, m6, [r4 + 96] pmaddwd m4, m2, [r4 + 112] paddd m5, m4 ; O[3] paddd m4, m1, m5 psrad m4, IDCT8_SHIFT1 psubd m1, m5 psrad m1, IDCT8_SHIFT1 packssdw m4, m1 movh [r5 + 3 * 16], m4 movhps [r5 + 4 * 16], m4 ret %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 pshufb m4, %1, [pb_idct8even] pmaddwd m4, [tab_idct8_1] phsubd m5, m4 pshufd m4, m4, 0x4E phaddd m4, m4 punpckhqdq m4, m5 ; m4 = dd e[ 0 1 2 3] paddd m4, m6 pshufb %1, %1, [r6] pmaddwd m5, %1, [r4] pmaddwd %1, [r4 + 16] phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3] paddd %1, m4, m5 psrad %1, IDCT8_SHIFT2 psubd m4, m5 psrad m4, IDCT8_SHIFT2 pshufd m4, m4, 0x1B packssdw %1, m4 %endmacro ; ---------------------------------------------------------------------------- INIT_XMM ssse3 cglobal patial_butterfly_inverse_internal_pass2 mova m0, [r5] PARTIAL_BUTTERFLY_PROCESS_ROW m0 movu [r1], m0 mova m2, [r5 + 16] PARTIAL_BUTTERFLY_PROCESS_ROW m2 movu [r1 + r2], m2 mova m1, [r5 + 32] PARTIAL_BUTTERFLY_PROCESS_ROW m1 movu [r1 +2*r2], m1 mova m3, [r5 + 48] PARTIAL_BUTTERFLY_PROCESS_ROW m3 movu [r1 + r3], m3 ret ; 
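; The ssse3 idct_8x8 driver below aligns the stack to 64 bytes, reserves a
; 16*mmsize temporary buffer, and calls the two helper passes above twice each:
; pass1 transforms first the left and then the right four columns into the
; buffer (advancing the src and buffer pointers by 8 bytes between calls), and
; pass2 then writes four output rows per call to dst using the i_dst stride.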
---------------------------------------------------------------------------- ; idct_8x8_ssse3 INIT_XMM ssse3 cglobal idct_8x8, 3,7,8 ;,0-16*mmsize ; alignment stack to 64-bytes mov r5, rsp sub rsp, 16*mmsize + gprsize and rsp, ~(64-1) mov [rsp + 16*mmsize], r5 mov r5, rsp lea r4, [tab_idct8_3] lea r6, [tab_dct4] call patial_butterfly_inverse_internal_pass1 add r0, 8 add r5, 8 call patial_butterfly_inverse_internal_pass1 mova m6, [pd_ %+ IDCT8_ROUND2] add r2, r2 lea r3, [r2 * 3] lea r4, [tab_idct8_2] lea r6, [pb_idct8odd] sub r5, 8 call patial_butterfly_inverse_internal_pass2 lea r1, [r1 + 4 * r2] add r5, 64 call patial_butterfly_inverse_internal_pass2 ; restore origin stack pointer mov rsp, [rsp + 16*mmsize] RET ; ============================================================================ ; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size) ; ============================================================================ ; ---------------------------------------------------------------------------- ; denoise_dct_sse4 INIT_XMM sse4 cglobal denoise_dct, 4, 4, 6 pxor m5, m5 shr r3d, 3 .loop: mova m0, [r0] pabsw m1, m0 movu m2, [r1] pmovsxwd m3, m1 paddd m2, m3 movu [r1], m2 movu m2, [r1 + 16] psrldq m3, m1, 8 pmovsxwd m4, m3 paddd m2, m4 movu [r1 + 16], m2 movu m3, [r2] psubusw m1, m3 pcmpgtw m4, m1, m5 pand m1, m4 psignw m1, m0 mova [r0], m1 add r0, 16 add r1, 32 add r2, 16 dec r3d jnz .loop RET ; ---------------------------------------------------------------------------- ; denoise_dct_avx2 INIT_YMM avx2 cglobal denoise_dct, 4, 4, 6 pxor m5, m5 shr r3d, 4 .loop: movu m0, [r0] pabsw m1, m0 movu m2, [r1] pmovsxwd m4, xm1 paddd m2, m4 movu [r1], m2 vextracti128 xm4, m1, 1 movu m2, [r1 + 32] pmovsxwd m3, xm4 paddd m2, m3 movu [r1 + 32], m2 movu m3, [r2] psubusw m1, m3 pcmpgtw m4, m1, m5 pand m1, m4 psignw m1, m0 movu [r0], m1 add r0, 32 add r1, 64 add r2, 32 dec r3d jnz .loop RET ; ============================================================================ ; ARCH_X86_64 ONLY ; ============================================================================ %if ARCH_X86_64 == 1 %macro DCT8_PASS_1 4 vpbroadcastq m0, [r6 + %1] pmaddwd m2, m%3, m0 pmaddwd m0, m%4 phaddd m2, m0 paddd m2, m5 psrad m2, DCT8_SHIFT1 packssdw m2, m2 vpermq m2, m2, 0x08 mova [r5 + %2], xm2 %endmacro %macro DCT8_PASS_2 2 vbroadcasti128 m4, [r6 + %1] pmaddwd m6, m0, m4 pmaddwd m7, m1, m4 pmaddwd m8, m2, m4 pmaddwd m9, m3, m4 phaddd m6, m7 phaddd m8, m9 phaddd m6, m8 paddd m6, m5 psrad m6, DCT8_SHIFT2 vbroadcasti128 m4, [r6 + %2] pmaddwd m10, m0, m4 pmaddwd m7, m1, m4 pmaddwd m8, m2, m4 pmaddwd m9, m3, m4 phaddd m10, m7 phaddd m8, m9 phaddd m10, m8 paddd m10, m5 psrad m10, DCT8_SHIFT2 packssdw m6, m10 vpermq m10, m6, 0xD8 %endmacro ; ============================================================================ ; void dct_8x8(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_8x8_avx2 INIT_YMM avx2 cglobal dct_8x8, 3, 7, 11, 0-8*16 vbroadcasti128 m5, [pd_ %+ DCT8_ROUND1] add r2d, r2d lea r3, [r2 * 3] lea r4, [r0 + r2 * 4] mov r5, rsp lea r6, [tab_dct8] mova m6, [dct8_shuf] ;pass1 mova xm0, [r0] vinserti128 m0, m0, [r4], 1 mova xm1, [r0 + r2] vinserti128 m1, m1, [r4 + r2], 1 mova xm2, [r0 + r2 * 2] vinserti128 m2, m2, [r4 + r2 * 2], 1 mova xm3, [r0 + r3] vinserti128 m3, m3, [r4 + r3], 1 punpcklqdq m4, m0, m1 punpckhqdq m0, m1 punpcklqdq m1, m2, m3 
punpckhqdq m2, m3 pshufb m0, m6 pshufb m2, m6 paddw m3, m4, m0 paddw m7, m1, m2 psubw m4, m0 psubw m1, m2 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 ;pass2 vbroadcasti128 m5, [pd_ %+ DCT8_ROUND2] mova m0, [r5] mova m1, [r5 + 32] mova m2, [r5 + 64] mova m3, [r5 + 96] DCT8_PASS_2 0 * 16, 1 * 16 movu [r1 ], m10 DCT8_PASS_2 2 * 16, 3 * 16 movu [r1 + 32], m10 DCT8_PASS_2 4 * 16, 5 * 16 movu [r1 + 64], m10 DCT8_PASS_2 6 * 16, 7 * 16 movu [r1 + 96], m10 RET %macro DCT16_PASS_1_E 2 vpbroadcastq m7, [r7 + %1] pmaddwd m4, m0, m7 pmaddwd m6, m2, m7 phaddd m4, m6 paddd m4, m9 psrad m4, DCT16_SHIFT1 packssdw m4, m4 vpermq m4, m4, 0x08 mova [r5 + %2], xm4 %endmacro %macro DCT16_PASS_1_O 2 vbroadcasti128 m7, [r7 + %1] pmaddwd m10, m0, m7 pmaddwd m11, m2, m7 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5] pmaddwd m11, m4, m7 pmaddwd m12, m6, m7 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7] phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7] paddd m10, m9 psrad m10, DCT16_SHIFT1 packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -] vpermq m10, m10, 0x08 mova [r5 + %2], xm10 %endmacro %macro DCT16_PASS_2 2 vbroadcasti128 m8, [r7 + %1] vbroadcasti128 m13, [r8 + %1] pmaddwd m10, m0, m8 pmaddwd m11, m1, m13 paddd m10, m11 pmaddwd m11, m2, m8 pmaddwd m12, m3, m13 paddd m11, m12 phaddd m10, m11 pmaddwd m11, m4, m8 pmaddwd m12, m5, m13 paddd m11, m12 pmaddwd m12, m6, m8 pmaddwd m13, m7, m13 paddd m12, m13 phaddd m11, m12 phaddd m10, m11 paddd m10, m9 psrad m10, DCT16_SHIFT2 vbroadcasti128 m8, [r7 + %2] vbroadcasti128 m13, [r8 + %2] pmaddwd m14, m0, m8 pmaddwd m11, m1, m13 paddd m14, m11 pmaddwd m11, m2, m8 pmaddwd m12, m3, m13 paddd m11, m12 phaddd m14, m11 pmaddwd m11, m4, m8 pmaddwd m12, m5, m13 paddd m11, m12 pmaddwd m12, m6, m8 pmaddwd m13, m7, m13 paddd m12, m13 phaddd m11, m12 phaddd m14, m11 paddd m14, m9 psrad m14, DCT16_SHIFT2 packssdw m10, m14 vextracti128 xm14, m10, 1 movlhps xm15, xm10, xm14 movhlps xm14, xm10 %endmacro ; ============================================================================ ; void dct_16x16(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_16x16_avx2 INIT_YMM avx2 cglobal dct_16x16, 3, 9, 16, 0-16*mmsize vbroadcasti128 m9, [pd_ %+ DCT16_ROUND1] add r2d, r2d mova m13, [dct16_shuf1] mova m14, [dct16_shuf2] lea r7, [tab_dct16_1 + 8 * 16] lea r8, [tab_dct16_2 + 8 * 16] lea r3, [r2 * 3] mov r5, rsp mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations .pass1: lea r6, [r0 + r2 * 4] movu m2, [r0] movu m1, [r6] vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo] vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi] movu m4, [r0 + r2] movu m3, [r6 + r2] vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo] vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi] movu m6, [r0 + r2 * 2] movu m5, [r6 + r2 * 2] vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo] vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi] movu m8, [r0 + r3] movu m7, [r6 + r3] vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo] vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi] pshufb m1, m13 pshufb m3, m13 pshufb m5, m13 pshufb m7, m13 paddw m8, m0, m1 ; E psubw m0, m1 ; O paddw m1, m2, m3 ; E psubw m2, m3 ; O paddw m3, m4, m5 ; E psubw m4, m5 ; O paddw m5, m6, m7 ; E psubw m6, m7 ; O 
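; At this point each 16-sample row has been split into its butterfly halves:
; after the dct16_shuf1 reversal, E[i] = x[i] + x[15-i] and O[i] = x[i] - x[15-i].
; The DCT16_PASS_1_O calls below compute the odd output rows directly from O,
; while the pshufb/phaddw/phsubw that follow split E once more into EE/EO for
; the even rows handled by DCT16_PASS_1_E.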
DCT16_PASS_1_O -7 * 16, 1 * 32 DCT16_PASS_1_O -5 * 16, 3 * 32 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16 DCT16_PASS_1_O 1 * 16, 5 * 32 DCT16_PASS_1_O 3 * 16, 7 * 32 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16 pshufb m8, m14 pshufb m1, m14 phaddw m0, m8, m1 pshufb m3, m14 pshufb m5, m14 phaddw m2, m3, m5 DCT16_PASS_1_E -8 * 16, 0 * 32 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16 DCT16_PASS_1_E 0 * 16, 4 * 32 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16 phsubw m0, m8, m1 phsubw m2, m3, m5 DCT16_PASS_1_E -6 * 16, 2 * 32 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16 DCT16_PASS_1_E 2 * 16, 6 * 32 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16 lea r0, [r0 + 8 * r2] add r5, 256 dec r4d jnz .pass1 mov r5, rsp mov r4d, 2 mov r2d, 32 lea r3, [r2 * 3] vbroadcasti128 m9, [pd_ %+ DCT16_ROUND2] .pass2: mova m0, [r5 + 0 * 32] ; [row0lo row4lo] mova m1, [r5 + 8 * 32] ; [row0hi row4hi] mova m2, [r5 + 1 * 32] ; [row1lo row5lo] mova m3, [r5 + 9 * 32] ; [row1hi row5hi] mova m4, [r5 + 2 * 32] ; [row2lo row6lo] mova m5, [r5 + 10 * 32] ; [row2hi row6hi] mova m6, [r5 + 3 * 32] ; [row3lo row7lo] mova m7, [r5 + 11 * 32] ; [row3hi row7hi] DCT16_PASS_2 -8 * 16, -7 * 16 movu [r1 ], xm15 movu [r1+r2], xm14 DCT16_PASS_2 -6 * 16, -5 * 16 movu [r1+r2*2], xm15 movu [r1+r3 ], xm14 lea r6, [r1 + r2 * 4] DCT16_PASS_2 -4 * 16, -3 * 16 movu [r6 ], xm15 movu [r6+r2], xm14 DCT16_PASS_2 -2 * 16, -1 * 16 movu [r6+r2*2], xm15 movu [r6+r3 ], xm14 lea r6, [r6 + r2 * 4] DCT16_PASS_2 0 * 16, 1 * 16 movu [r6 ], xm15 movu [r6+r2], xm14 DCT16_PASS_2 2 * 16, 3 * 16 movu [r6+r2*2], xm15 movu [r6+r3 ], xm14 lea r6, [r6 + r2 * 4] DCT16_PASS_2 4 * 16, 5 * 16 movu [r6 ], xm15 movu [r6+r2], xm14 DCT16_PASS_2 6 * 16, 7 * 16 movu [r6+r2*2], xm15 movu [r6+r3], xm14 add r1, 16 add r5, 128 dec r4d jnz .pass2 RET %macro DCT32_PASS_1 4 vbroadcasti128 m8, [r7 + %1] pmaddwd m11, m%3, m8 pmaddwd m12, m%4, m8 phaddd m11, m12 vbroadcasti128 m8, [r7 + %1 + 32] vbroadcasti128 m10, [r7 + %1 + 48] pmaddwd m12, m5, m8 pmaddwd m13, m6, m10 phaddd m12, m13 pmaddwd m13, m4, m8 pmaddwd m14, m7, m10 phaddd m13, m14 phaddd m12, m13 phaddd m11, m12 paddd m11, m9 psrad m11, DCT32_SHIFT1 ; DCT32_SHIFT1 vpermq m11, m11, 0xD8 packssdw m11, m11 movq [r5+%2 ], xm11 vextracti128 xm10, m11, 1 movq [r5+%2+64], xm10 %endmacro %macro DCT32_PASS_2 1 mova m8, [r7 + %1] mova m10, [r8 + %1] pmaddwd m11, m0, m8 pmaddwd m12, m1, m10 paddd m11, m12 pmaddwd m12, m2, m8 pmaddwd m13, m3, m10 paddd m12, m13 phaddd m11, m12 pmaddwd m12, m4, m8 pmaddwd m13, m5, m10 paddd m12, m13 pmaddwd m13, m6, m8 pmaddwd m14, m7, m10 paddd m13, m14 phaddd m12, m13 phaddd m11, m12 vextracti128 xm10, m11, 1 paddd xm11, xm10 paddd xm11, xm9 psrad xm11, DCT32_SHIFT2 packssdw xm11, xm11 %endmacro ; ============================================================================ ; void dct_32x32(const coeff_t *src, coeff_t *dst, int i_src) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_32x32_avx2 INIT_YMM avx2 cglobal dct_32x32, 3, 9, 16, 0-64*mmsize vpbroadcastq m9, [pd_4] ; add1 add r2d, r2d ; r2 <-- i_src lea r7, [tab_dct32_1] lea r8, [tab_dct32_2] lea r3, [r2 * 3] mov r5, rsp mov r4d, 8 mova m15, [dct16_shuf1] .pass1: movu m2, [r0 ] movu m1, [r0 + 32] pshufb m1, m15 vpermq m1, m1, 0x4E psubw m7, m2, m1 paddw m2, m1 movu m1, [r0 + r2 * 2 ] movu m0, [r0 + r2 * 2 + 32] pshufb m0, m15 vpermq m0, m0, 0x4E psubw m8, m1, m0 paddw m1, m0 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E 
vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E pshufb m3, m15 psubw m1, m0, m3 paddw m0, m3 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O movu m4, [r0 + r2 ] movu m2, [r0 + r2 + 32] pshufb m2, m15 vpermq m2, m2, 0x4E psubw m10, m4, m2 paddw m4, m2 movu m3, [r0 + r3 ] movu m2, [r0 + r3 + 32] pshufb m2, m15 vpermq m2, m2, 0x4E psubw m11, m3, m2 paddw m3, m2 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E pshufb m8, m15 psubw m3, m2, m8 paddw m2, m8 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O DCT32_PASS_1 0 * 32, 0 * 64, 0, 2 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3 add r5, 8 lea r0, [r0 + r2 * 4] dec r4d jnz .pass1 mov r2d, 64 lea r3, [r2 * 3] mov r5, rsp mov r4d, 8 vpbroadcastq xm9, [pd_ %+ DCT32_ROUND2] ; sfdong .pass2: mova m0, [r5 + 0 * 64 ] mova m1, [r5 + 0 * 64 + 32] mova m2, [r5 + 1 * 64 ] mova m3, [r5 + 1 * 64 + 32] mova m4, [r5 + 2 * 64 ] mova m5, [r5 + 2 * 64 + 32] mova m6, [r5 + 3 * 64 ] mova m7, [r5 + 3 * 64 + 32] DCT32_PASS_2 0 * 32 movq [r1 ], xm11 DCT32_PASS_2 1 * 32 movq [r1+r2 ], xm11 DCT32_PASS_2 2 * 32 movq [r1+r2*2], xm11 DCT32_PASS_2 3 * 32 movq [r1+r3 ], xm11 lea r6, [r1 + r2 * 4] DCT32_PASS_2 4 * 32 movq [r6 ], xm11 DCT32_PASS_2 5 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 6 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 7 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 8 * 32 movq [r6 ], xm11 DCT32_PASS_2 9 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 10 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 11 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 12 * 32 movq [r6 ], xm11 DCT32_PASS_2 13 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 14 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 15 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 16 * 32 movq [r6 ], xm11 DCT32_PASS_2 17 * 32 movq [r6+r2], xm11 DCT32_PASS_2 18 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 19 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 20 * 32 movq [r6 ], xm11 DCT32_PASS_2 21 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 22 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 23 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 24 * 32 movq [r6 ], xm11 DCT32_PASS_2 25 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 26 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 27 * 32 movq [r6+r3 ], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 28 * 32 movq [r6 ], xm11 DCT32_PASS_2 29 * 32 movq [r6+r2 ], xm11 DCT32_PASS_2 30 * 32 movq [r6+r2*2], xm11 DCT32_PASS_2 31 * 32 movq [r6+r3], xm11 add r5, 256 add r1, 8 dec r4d jnz .pass2 RET %macro IDCT8_PASS_1 1 vpbroadcastd m7, [r5 + %1 ] vpbroadcastd m10, [r5 + %1 + 4] pmaddwd m5, m4, m7 pmaddwd m6, m0, m10 paddd m5, m6 vpbroadcastd m7, [r6 + %1 ] vpbroadcastd m10, [r6 + %1 + 4] pmaddwd m6, m1, m7 pmaddwd m3, m2, m10 paddd m6, m3 paddd m3, m5, m6 paddd m3, m11 psrad m3, IDCT8_SHIFT1 psubd m5, m6 paddd m5, m11 psrad m5, IDCT8_SHIFT1 vpbroadcastd m7, [r5 + %1 + 32] vpbroadcastd m10, [r5 + %1 + 36] pmaddwd m6, m4, m7 pmaddwd m8, m0, m10 paddd 
m6, m8 vpbroadcastd m7, [r6 + %1 + 32] vpbroadcastd m10, [r6 + %1 + 36] pmaddwd m8, m1, m7 pmaddwd m9, m2, m10 paddd m8, m9 paddd m9, m6, m8 paddd m9, m11 psrad m9, IDCT8_SHIFT1 psubd m6, m8 paddd m6, m11 psrad m6, IDCT8_SHIFT1 packssdw m3, m9 vpermq m3, m3, 0xD8 packssdw m6, m5 vpermq m6, m6, 0xD8 %endmacro %macro IDCT8_PASS_2 0 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 pmaddwd m3, m2, [r5] pmaddwd m5, m2, [r5 + 32] pmaddwd m6, m2, [r5 + 64] pmaddwd m7, m2, [r5 + 96] phaddd m3, m5 phaddd m6, m7 pshufb m3, [idct8_shuf2] pshufb m6, [idct8_shuf2] punpcklqdq m7, m3, m6 punpckhqdq m3, m6 pmaddwd m5, m0, [r6] pmaddwd m6, m0, [r6 + 32] pmaddwd m8, m0, [r6 + 64] pmaddwd m9, m0, [r6 + 96] phaddd m5, m6 phaddd m8, m9 pshufb m5, [idct8_shuf2] pshufb m8, [idct8_shuf2] punpcklqdq m6, m5, m8 punpckhqdq m5, m8 paddd m8, m7, m6 paddd m8, m12 psrad m8, IDCT8_SHIFT2 psubd m7, m6 paddd m7, m12 psrad m7, IDCT8_SHIFT2 pshufb m7, [idct8_shuf3] packssdw m8, m7 paddd m9, m3, m5 paddd m9, m12 psrad m9, IDCT8_SHIFT2 psubd m3, m5 paddd m3, m12 psrad m3, IDCT8_SHIFT2 pshufb m3, [idct8_shuf3] packssdw m9, m3 %endmacro ; ============================================================================ ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ---------------------------------------------------------------------------- ; dct_32x32_avx2 INIT_YMM avx2 cglobal idct_8x8, 3, 7, 13, 0-8*16 vbroadcasti128 m11, [pd_ %+ IDCT8_ROUND1] vpbroadcastd m12, [pd_ %+ IDCT8_ROUND2] mov r4, rsp lea r5, [avx2_idct8_1] lea r6, [avx2_idct8_2] ;pass1 mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1] mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3] vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3] vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5] mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7] vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7] vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] mova m5, [idct8_shuf1] vpermd m4, m5, m4 vpermd m0, m5, m0 vpermd m1, m5, m1 vpermd m2, m5, m2 IDCT8_PASS_1 0 mova [r4], m3 mova [r4 + 96], m6 IDCT8_PASS_1 64 mova [r4 + 32], m3 mova [r4 + 64], m6 ;pass2 add r2d, r2d lea r3, [r2 * 3] mova m0, [r4 ] mova m1, [r4 + 32] IDCT8_PASS_2 vextracti128 xm3, m8, 1 mova [r1 ], xm8 mova [r1 + r2], xm3 vextracti128 xm3, m9, 1 mova [r1+r2*2], xm9 mova [r1+r3 ], xm3 lea r1, [r1 + r2 * 4] mova m0, [r4 + 64] mova m1, [r4 + 96] IDCT8_PASS_2 vextracti128 xm3, m8, 1 mova [r1 ], xm8 mova [r1 + r2], xm3 vextracti128 xm3, m9, 1 mova [r1+r2*2], xm9 mova [r1+r3 ], xm3 RET %macro IDCT16_PASS1 2 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16] pmaddwd m9, m0, m5 pmaddwd m10, m7, m5 phaddd m9, m10 pmaddwd m10, m6, m5 pmaddwd m11, m8, m5 phaddd m10, m11 phaddd m9, m10 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16] pmaddwd m10, m1, m5 pmaddwd m11, m3, m5 phaddd m10, m11 pmaddwd m11, m4, m5 pmaddwd m12, m2, m5 phaddd m11, m12 phaddd m10, m11 paddd m11, m9, m10 paddd m11, m14 psrad m11, IDCT16_SHIFT1 psubd m9, m10 paddd m9, m14 psrad m9, IDCT16_SHIFT1 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16] pmaddwd 
m10, m0, m5 pmaddwd m12, m7, m5 phaddd m10, m12 pmaddwd m12, m6, m5 pmaddwd m13, m8, m5 phaddd m12, m13 phaddd m10, m12 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16] pmaddwd m12, m1, m5 pmaddwd m13, m3, m5 phaddd m12, m13 pmaddwd m13, m4, m5 pmaddwd m5, m2 phaddd m13, m5 phaddd m12, m13 paddd m5, m10, m12 paddd m5, m14 psrad m5, IDCT16_SHIFT1 psubd m10, m12 paddd m10, m14 psrad m10, IDCT16_SHIFT1 packssdw m11, m5 packssdw m9, m10 mova m10, [idct16_shuff] mova m5, [idct16_shuff1] vpermd m12, m10, m11 vpermd m13, m5, m9 mova [r3 + %1 * 16 * 2 ], xm12 mova [r3 + %2 * 16 * 2 ], xm13 vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1 vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1 %endmacro ; ============================================================================ ; void idct_16x16(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ------------------------------------------------------------------ ; idct_16x16_avx2 INIT_YMM avx2 cglobal idct_16x16, 3, 7, 16, 0-16*mmsize vbroadcasti128 m14, [pd_ %+ IDCT16_ROUND1] vpbroadcastd m15, [pd_ %+ IDCT16_ROUND2] add r2d, r2d mov r3, rsp mov r4d, 2 .pass1: movu xm0, [r0 + 0 * 32] movu xm1, [r0 + 8 * 32] punpckhqdq xm2, xm0, xm1 punpcklqdq xm0, xm1 vinserti128 m0, m0, xm2, 1 movu xm1, [r0 + 1 * 32] movu xm2, [r0 + 9 * 32] punpckhqdq xm3, xm1, xm2 punpcklqdq xm1, xm2 vinserti128 m1, m1, xm3, 1 movu xm2, [r0 + 2 * 32] movu xm3, [r0 + 10 * 32] punpckhqdq xm4, xm2, xm3 punpcklqdq xm2, xm3 vinserti128 m2, m2, xm4, 1 movu xm3, [r0 + 3 * 32] movu xm4, [r0 + 11 * 32] punpckhqdq xm5, xm3, xm4 punpcklqdq xm3, xm4 vinserti128 m3, m3, xm5, 1 movu xm4, [r0 + 4 * 32] movu xm5, [r0 + 12 * 32] punpckhqdq xm6, xm4, xm5 punpcklqdq xm4, xm5 vinserti128 m4, m4, xm6, 1 movu xm5, [r0 + 5 * 32] movu xm6, [r0 + 13 * 32] punpckhqdq xm7, xm5, xm6 punpcklqdq xm5, xm6 vinserti128 m5, m5, xm7, 1 movu xm6, [r0 + 6 * 32] movu xm7, [r0 + 14 * 32] punpckhqdq xm8, xm6, xm7 punpcklqdq xm6, xm7 vinserti128 m6, m6, xm8, 1 movu xm7, [r0 + 7 * 32] movu xm8, [r0 + 15 * 32] punpckhqdq xm9, xm7, xm8 punpcklqdq xm7, xm8 vinserti128 m7, m7, xm9, 1 punpckhwd m8, m0, m2 ;[8 10] punpcklwd m0, m2 ;[0 2] punpckhwd m2, m1, m3 ;[9 11] punpcklwd m1, m3 ;[1 3] punpckhwd m3, m4, m6 ;[12 14] punpcklwd m4, m6 ;[4 6] punpckhwd m6, m5, m7 ;[13 15] punpcklwd m5, m7 ;[5 7] punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] punpckhqdq m8, m7, m4 ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146] punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 
152 16 36 56 76 96 116 136 156] IDCT16_PASS1 0, 14 IDCT16_PASS1 2, 12 IDCT16_PASS1 4, 10 IDCT16_PASS1 6, 8 add r0, 16 add r3, 16 dec r4d jnz .pass1 mov r3, rsp mov r4d, 8 lea r5, [tab_idct16_2] lea r6, [tab_idct16_1] vbroadcasti128 m7, [r5 ] vbroadcasti128 m8, [r5 + 16] vbroadcasti128 m9, [r5 + 32] vbroadcasti128 m10, [r5 + 48] vbroadcasti128 m11, [r5 + 64] vbroadcasti128 m12, [r5 + 80] vbroadcasti128 m13, [r5 + 96] .pass2: movu m1, [r3] vpermq m0, m1, 0xD8 pmaddwd m1, m0, m7 pmaddwd m2, m0, m8 phaddd m1, m2 pmaddwd m2, m0, m9 pmaddwd m3, m0, m10 phaddd m2, m3 phaddd m1, m2 pmaddwd m2, m0, m11 pmaddwd m3, m0, m12 phaddd m2, m3 vbroadcasti128 m14, [r5 + 112] pmaddwd m3, m0, m13 pmaddwd m4, m0, m14 phaddd m3, m4 phaddd m2, m3 movu m3, [r3 + 32] vpermq m0, m3, 0xD8 vbroadcasti128 m14, [r6 ] pmaddwd m3, m0, m14 vbroadcasti128 m14, [r6 + 16] pmaddwd m4, m0, m14 phaddd m3, m4 vbroadcasti128 m14, [r6 + 32] pmaddwd m4, m0, m14 vbroadcasti128 m14, [r6 + 48] pmaddwd m5, m0, m14 phaddd m4, m5 phaddd m3, m4 vbroadcasti128 m14, [r6 + 64] pmaddwd m4, m0, m14 vbroadcasti128 m14, [r6 + 80] pmaddwd m5, m0, m14 phaddd m4, m5 vbroadcasti128 m14, [r6 + 96] pmaddwd m6, m0, m14 vbroadcasti128 m14, [r6 + 112] pmaddwd m0, m14 phaddd m6, m0 phaddd m4, m6 paddd m5, m1, m3 paddd m5, m15 psrad m5, IDCT16_SHIFT2 psubd m1, m3 paddd m1, m15 psrad m1, IDCT16_SHIFT2 paddd m6, m2, m4 paddd m6, m15 psrad m6, IDCT16_SHIFT2 psubd m2, m4 paddd m2, m15 psrad m2, IDCT16_SHIFT2 packssdw m5, m6 packssdw m1, m2 pshufb m2, m1, [dct16_shuf1] mova [r1 ], xm5 mova [r1 + 16], xm2 vextracti128 [r1 + r2 ], m5, 1 vextracti128 [r1 + r2 + 16], m2, 1 lea r1, [r1 + 2 * r2] add r3, 64 dec r4d jnz .pass2 RET %macro IDCT32_PASS1 1 vbroadcasti128 m3, [tab_idct32_1 + %1 * 32 ] vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16] pmaddwd m9, m4, m3 pmaddwd m10, m8, m13 phaddd m9, m10 pmaddwd m10, m2, m3 pmaddwd m11, m1, m13 phaddd m10, m11 phaddd m9, m10 vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32 ] vbroadcasti128 m13, [tab_idct32_1 + (15 - %1) * 32 + 16] pmaddwd m10, m4, m3 pmaddwd m11, m8, m13 phaddd m10, m11 pmaddwd m11, m2, m3 pmaddwd m12, m1, m13 phaddd m11, m12 phaddd m10, m11 phaddd m9, m10 ; [row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15] vbroadcasti128 m3, [tab_idct32_2 + %1 * 16] pmaddwd m10, m0, m3 pmaddwd m11, m7, m3 phaddd m10, m11 phaddd m10, m10 vbroadcasti128 m3, [tab_idct32_3 + %1 * 16] pmaddwd m11, m5, m3 pmaddwd m12, m6, m3 phaddd m11, m12 phaddd m11, m11 paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL] psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL] punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15] paddd m10, m9, m12 paddd m10, m15 psrad m10, IDCT32_SHIFT1 psubd m12, m9 paddd m12, m15 psrad m12, IDCT32_SHIFT1 packssdw m10, m12 vextracti128 xm12, m10, 1 movd [r3 + %1 * 64], xm10 movd [r3 + 32 + %1 * 64], xm12 pextrd [r4 - %1 * 64], xm10, 1 pextrd [r4 + 32 - %1 * 64], xm12, 1 pextrd [r3 + 16 * 64 + %1 * 64], xm10, 3 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2 %endmacro ; ============================================================================ ; void idct_32x32(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ------------------------------------------------------------------ ; idct_32x32_avx2 ; TODO: Reduce PHADDD instruction by PADDD INIT_YMM avx2 cglobal idct_32x32, 3, 6, 16, 
0-32*64 vbroadcasti128 m15, [pd_ %+ IDCT32_ROUND1] mov r3, rsp lea r4, [r3 + 15 * 64] mov r5d, 8 .pass1: movq xm0, [r0 + 2 * 64] movq xm1, [r0 + 18 * 64] punpcklqdq xm0, xm0, xm1 movq xm1, [r0 + 0 * 64] movq xm2, [r0 + 16 * 64] punpcklqdq xm1, xm1, xm2 vinserti128 m0, m0, xm1, 1 ;[2 18 0 16] movq xm1, [r0 + 1 * 64] movq xm2, [r0 + 9 * 64] punpcklqdq xm1, xm1, xm2 movq xm2, [r0 + 17 * 64] movq xm3, [r0 + 25 * 64] punpcklqdq xm2, xm2, xm3 vinserti128 m1, m1, xm2, 1 ;[1 9 17 25] movq xm2, [r0 + 6 * 64] movq xm3, [r0 + 22 * 64] punpcklqdq xm2, xm2, xm3 movq xm3, [r0 + 4 * 64] movq xm4, [r0 + 20 * 64] punpcklqdq xm3, xm3, xm4 vinserti128 m2, m2, xm3, 1 ;[6 22 4 20] movq xm3, [r0 + 3 * 64] movq xm4, [r0 + 11 * 64] punpcklqdq xm3, xm3, xm4 movq xm4, [r0 + 19 * 64] movq xm5, [r0 + 27 * 64] punpcklqdq xm4, xm4, xm5 vinserti128 m3, m3, xm4, 1 ;[3 11 17 25] movq xm4, [r0 + 10 * 64] movq xm5, [r0 + 26 * 64] punpcklqdq xm4, xm4, xm5 movq xm5, [r0 + 8 * 64] movq xm6, [r0 + 24 * 64] punpcklqdq xm5, xm5, xm6 vinserti128 m4, m4, xm5, 1 ;[10 26 8 24] movq xm5, [r0 + 5 * 64] movq xm6, [r0 + 13 * 64] punpcklqdq xm5, xm5, xm6 movq xm6, [r0 + 21 * 64] movq xm7, [r0 + 29 * 64] punpcklqdq xm6, xm6, xm7 vinserti128 m5, m5, xm6, 1 ;[5 13 21 9] movq xm6, [r0 + 14 * 64] movq xm7, [r0 + 30 * 64] punpcklqdq xm6, xm6, xm7 movq xm7, [r0 + 12 * 64] movq xm8, [r0 + 28 * 64] punpcklqdq xm7, xm7, xm8 vinserti128 m6, m6, xm7, 1 ;[14 30 12 28] movq xm7, [r0 + 7 * 64] movq xm8, [r0 + 15 * 64] punpcklqdq xm7, xm7, xm8 movq xm8, [r0 + 23 * 64] movq xm9, [r0 + 31 * 64] punpcklqdq xm8, xm8, xm9 vinserti128 m7, m7, xm8, 1 ;[7 15 23 31] punpckhwd m8, m0, m2 ;[18 22 16 20] punpcklwd m0, m2 ;[2 6 0 4] punpckhwd m2, m1, m3 ;[9 11 25 27] punpcklwd m1, m3 ;[1 3 17 19] punpckhwd m3, m4, m6 ;[26 30 24 28] punpcklwd m4, m6 ;[10 14 8 12] punpckhwd m6, m5, m7 ;[13 15 29 31] punpcklwd m5, m7 ;[5 7 21 23] punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233] punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301] vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281] vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303] vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283] 
vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311] vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151] vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313] vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153] IDCT32_PASS1 0 IDCT32_PASS1 1 IDCT32_PASS1 2 IDCT32_PASS1 3 IDCT32_PASS1 4 IDCT32_PASS1 5 IDCT32_PASS1 6 IDCT32_PASS1 7 add r0, 8 add r3, 4 add r4, 4 dec r5d jnz .pass1 %if BIT_DEPTH == 10 vpbroadcastd m15, [pd_ %+ IDCT32_ROUND2] ; add2 %elif BIT_DEPTH == 8 vpbroadcastd m15, [pd_ %+ IDCT32_ROUND2] ; add2 %else %error Unsupported BIT_DEPTH! %endif mov r3, rsp add r2d, r2d mov r4d, 32 mova m7, [tab_idct32_4 ] mova m8, [tab_idct32_4 + 32] mova m9, [tab_idct32_4 + 64] mova m10, [tab_idct32_4 + 96] mova m11, [tab_idct32_4 + 128] mova m12, [tab_idct32_4 + 160] mova m13, [tab_idct32_4 + 192] mova m14, [tab_idct32_4 + 224] .pass2: movu m0, [r3] movu m1, [r3 + 32] pmaddwd m2, m0, m7 pmaddwd m3, m0, m8 phaddd m2, m3 pmaddwd m3, m0, m9 pmaddwd m4, m0, m10 phaddd m3, m4 phaddd m2, m3 pmaddwd m3, m0, m11 pmaddwd m4, m0, m12 phaddd m3, m4 pmaddwd m4, m0, m13 pmaddwd m5, m0, m14 phaddd m4, m5 phaddd m3, m4 vperm2i128 m4, m2, m3, 0x31 vperm2i128 m2, m2, m3, 0x20 paddd m2, m4 pmaddwd m3, m0, [tab_idct32_4 + 256] pmaddwd m4, m0, [tab_idct32_4 + 288] phaddd m3, m4 pmaddwd m4, m0, [tab_idct32_4 + 320] pmaddwd m5, m0, [tab_idct32_4 + 352] phaddd m4, m5 phaddd m3, m4 pmaddwd m4, m0, [tab_idct32_4 + 384] pmaddwd m5, m0, [tab_idct32_4 + 416] phaddd m4, m5 pmaddwd m5, m0, [tab_idct32_4 + 448] pmaddwd m0, [tab_idct32_4 + 480] phaddd m5, m0 phaddd m4, m5 vperm2i128 m0, m3, m4, 0x31 vperm2i128 m3, m3, m4, 0x20 paddd m3, m0 pmaddwd m4, m1, [tab_idct32_1] pmaddwd m0, m1, [tab_idct32_1 + 32] phaddd m4, m0 pmaddwd m5, m1, [tab_idct32_1 + 64] pmaddwd m0, m1, [tab_idct32_1 + 96] phaddd m5, m0 phaddd m4, m5 pmaddwd m5, m1, [tab_idct32_1 + 128] pmaddwd m0, m1, [tab_idct32_1 + 160] phaddd m5, m0 pmaddwd m6, m1, [tab_idct32_1 + 192] pmaddwd m0, m1, [tab_idct32_1 + 224] phaddd m6, m0 phaddd m5, m6 vperm2i128 m0, m4, m5, 0x31 vperm2i128 m4, m4, m5, 0x20 paddd m4, m0 pmaddwd m5, m1, [tab_idct32_1 + 256] pmaddwd m0, m1, [tab_idct32_1 + 288] phaddd m5, m0 pmaddwd m6, m1, [tab_idct32_1 + 320] pmaddwd m0, m1, [tab_idct32_1 + 352] phaddd m6, m0 phaddd m5, m6 pmaddwd m6, m1, [tab_idct32_1 + 384] pmaddwd m0, m1, [tab_idct32_1 + 416] phaddd m6, m0 pmaddwd m0, m1, [tab_idct32_1 + 448] pmaddwd m1, [tab_idct32_1 + 480] phaddd m0, m1 phaddd m6, m0 vperm2i128 m0, m5, m6, 0x31 vperm2i128 m5, m5, m6, 0x20 paddd m5, m0 paddd m6, m2, m4 paddd m6, m15 psrad m6, IDCT32_SHIFT2 ; IDCT32_SHIFT2 psubd m2, m4 paddd m2, m15 psrad m2, IDCT32_SHIFT2 ; IDCT32_SHIFT2 paddd m4, m3, m5 paddd m4, m15 psrad m4, IDCT32_SHIFT2 ; IDCT32_SHIFT2 psubd m3, m5 paddd m3, m15 psrad m3, IDCT32_SHIFT2 ; IDCT32_SHIFT2 packssdw m6, m4 packssdw m2, m3 vpermq m6, m6, 0xD8 vpermq m2, m2, 0x8D pshufb m2, [dct16_shuf1] mova [r1 ], m6 mova [r1 + 32], m2 add r1, r2 add r3, 64 dec r4d jnz .pass2 RET ; ============================================================================ ; void idct_4x4(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ------------------------------------------------------------------ ; idct_4x4_avx2 INIT_YMM avx2 cglobal idct_4x4, 3, 4, 6 vbroadcasti128 m4, [pd_ %+ IDCT4_ROUND1] vpbroadcastd m5, [pd_ %+ IDCT4_ROUND2] add r2d, r2d 
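    ; i_dst counts 16-bit coeff_t elements, so the add above turns it into a byte stride;
    ; the lea below caches 3*stride so the final four row stores need no extra address math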
lea r3, [r2 * 3] movu m0, [r0] ; [00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33] pshufb m0, [idct4_shuf1] ; [00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33] vextracti128 xm1, m0, 1 ; [20 22 21 23 30 32 31 33] punpcklwd xm2, xm0, xm1 ; [00 20 02 22 01 21 03 23] punpckhwd xm0, xm1 ; [10 30 12 32 11 31 13 33] vinserti128 m2, m2, xm2, 1 ; [00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] vinserti128 m0, m0, xm0, 1 ; [10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] mova m1, [avx2_idct4_1] mova m3, [avx2_idct4_1 + 32] pmaddwd m1, m2 pmaddwd m3, m0 paddd m0, m1, m3 paddd m0, m4 psrad m0, IDCT4_SHIFT1 ; [00 20 10 30 01 21 11 31] psubd m1, m3 paddd m1, m4 psrad m1, IDCT4_SHIFT1 ; [03 23 13 33 02 22 12 32] packssdw m0, m1 ; [00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32] vmovshdup m1, m0 ; [10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32] vmovsldup m0, m0 ; [00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22] vpbroadcastq m2, [avx2_idct4_2] vpbroadcastq m3, [avx2_idct4_2 + 8] pmaddwd m0, m2 pmaddwd m1, m3 paddd m2, m0, m1 paddd m2, m5 psrad m2, IDCT4_SHIFT2 ; [00 01 10 11 30 31 20 21] psubd m0, m1 paddd m0, m5 psrad m0, IDCT4_SHIFT2 ; [03 02 13 12 33 32 23 22] pshufb m0, [idct4_shuf2] ; [02 03 12 13 32 33 22 23] punpcklqdq m1, m2, m0 ; [00 01 02 03 10 11 12 13] punpckhqdq m2, m0 ; [30 31 32 33 20 21 22 23] packssdw m1, m2 ; [00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23] vextracti128 xm0, m1, 1 movq [r1 ], xm1 movq [r1 + r2], xm0 movhps [r1 + 2 * r2], xm0 movhps [r1 + r3], xm1 RET %endif xavs2-1.3/source/common/x86/dct8.h000066400000000000000000000060431340660520300166520ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Nabajit Deka ;* Min Chen * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #ifndef XAVS2_I386_DCT8_H #define XAVS2_I386_DCT8_H #define xavs2_dct_4x4_sse2 FPFX(dct_4x4_sse2) void xavs2_dct_4x4_sse2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_4x4_avx2 FPFX(dct_4x4_avx2) void xavs2_dct_4x4_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_8x8_sse2 FPFX(dct_8x8_sse2) void xavs2_dct_8x8_sse2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_8x8_sse4 FPFX(dct_8x8_sse4) void xavs2_dct_8x8_sse4 (const coeff_t *src, coeff_t *dst, int i_src); #if ARCH_X86_64 #define xavs2_dct_8x8_avx2 FPFX(dct_8x8_avx2) void xavs2_dct_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_16x16_avx2 FPFX(dct_16x16_avx2) void xavs2_dct_16x16_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #define xavs2_dct_32x32_avx2 FPFX(dct_32x32_avx2) void xavs2_dct_32x32_avx2 (const coeff_t *src, coeff_t *dst, int i_src); #endif #define xavs2_idct_4x4_sse2 FPFX(idct_4x4_sse2) void xavs2_idct_4x4_sse2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_8x8_ssse3 FPFX(idct_8x8_ssse3) void xavs2_idct_8x8_ssse3 (const coeff_t *src, coeff_t *dst, int i_dst); #if ARCH_X86_64 #define xavs2_idct_4x4_avx2 FPFX(idct_4x4_avx2) void xavs2_idct_4x4_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_8x8_sse2 FPFX(idct_8x8_sse2) void xavs2_idct_8x8_sse2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_8x8_avx2 FPFX(idct_8x8_avx2) void xavs2_idct_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_16x16_avx2 FPFX(idct_16x16_avx2) void xavs2_idct_16x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define xavs2_idct_32x32_avx2 FPFX(idct_32x32_avx2) void xavs2_idct_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #endif #endif // ifndef XAVS2_I386_DCT8_H xavs2-1.3/source/common/x86/mc-a.asm000066400000000000000000004724021340660520300171640ustar00rootroot00000000000000;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Dylan Yudaken ;* Holger Lubitz ;* Min Chen ;* Oskar Arvidsson ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" %if BIT_DEPTH==8 %define ADDAVG_FACTOR 256 %define ADDAVG_ROUND 128 %elif BIT_DEPTH==10 %define ADDAVG_FACTOR 1024 %define ADDAVG_ROUND 512 %elif BIT_DEPTH==12 %define ADDAVG_FACTOR 4096 %define ADDAVG_ROUND 2048 %else %error Unsupport bit depth! %endif SECTION_RODATA 32 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 times 8 db 6 SECTION .text cextern pb_0 cextern pw_1 cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 cextern pw_128 cextern pw_256 cextern pw_512 cextern pw_1023 cextern pw_1024 cextern pw_2048 cextern pw_4096 cextern pw_00ff cextern pw_pixel_max cextern pd_32 cextern pd_64 cextern pq_1 ;==================================================================================================================== ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ;==================================================================================================================== ; r0 = pSrc0, r1 = pSrc1 ; r2 = pDst, r3 = iStride0 ; r4 = iStride1, r5 = iDstStride %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride add r3, r3 add r4, r4 add r5, r5 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] punpckldq m1, m2 punpckldq m3, m4 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m0, [r1 + r4] punpckldq m2, m4 punpckldq m5, m0 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR] paddw m1, [pw_ %+ ADDAVG_ROUND] pxor m0, m0 pmaxsw m1, m0 pminsw m1, [pw_pixel_max] movd [r2], m1 pextrd [r2 + r5], m1, 1 lea r2, [r2 + 2 * r5] pextrd [r2], m1, 2 pextrd [r2 + r5], m1, 3 RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m0, [pw_ %+ ADDAVG_ROUND] pxor m7, m7 add r3, r3 add r4, r4 add r5, r5 %rep 2 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] punpckldq m1, m2 punpckldq m3, m4 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m6, [r1 + r4] punpckldq m2, m4 punpckldq m5, m6 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR] paddw m1, m0 pmaxsw m1, m7 pminsw m1, [pw_pixel_max] movd [r2], m1 pextrd [r2 + r5], m1, 1 lea r2, [r2 + 2 * r5] pextrd [r2], m1, 2 pextrd [r2 + r5], m1, 3 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] lea r2, [r2 + 2 * r5] %endrep RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m6, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] mov r6d, 16/4 add r3, r3 add r4, r4 add r5, r5 .loop: movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] punpckldq m1, m2 punpckldq m3, m4 movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m0, [r1 + r4] lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] punpckldq m2, m4 punpckldq m5, m0 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, m7 paddw m1, [pw_ %+ ADDAVG_ROUND] pxor m0, m0 pmaxsw m1, m0 pminsw m1, m6 movd [r2], m1 pextrd [r2 + r5], m1, 1 lea r2, [r2 + r5 * 2] pextrd [r2], m1, 2 pextrd [r2 + r5], m1, 3 lea r2, [r2 + r5 * 2] dec r6d 
jnz .loop RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride add r3, r3 add r4, r4 add r5, r5 movh m0, [r0] movh m1, [r0 + r3] movh m2, [r1] movh m3, [r1 + r4] punpcklqdq m0, m1 punpcklqdq m2, m3 paddw m0, m2 pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR] paddw m0, [pw_ %+ ADDAVG_ROUND] pxor m6, m6 pmaxsw m0, m6 pminsw m0, [pw_pixel_max] movh [r2], m0 movhps [r2 + r5], m0 RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 %rep 4 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movh [r2], m0 pextrd [r2 + 8], m0, 2 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movh [r2 + r5], m1 pextrd [r2 + r5 + 8], m1, 2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 mov r6d, 16/2 add r3, r3 add r4, r4 add r5, r5 .loop: movu m0, [r0] movu m2, [r1] movu m1, [r0 + r3] movu m3, [r1 + r4] dec r6d lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] paddw m0, m2 paddw m1, m3 pmulhrsw m0, m7 pmulhrsw m1, m7 paddw m0, m4 paddw m1, m4 pmaxsw m0, m6 pmaxsw m1, m6 pminsw m0, m5 pminsw m1, m5 movh [r2], m0 pextrd [r2 + 8], m0, 2 movh [r2 + r5], m1 pextrd [r2 + r5 + 8], m1, 2 lea r2, [r2 + r5 * 2] jnz .loop RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 RET ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 %rep 3 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep RET ;----------------------------------------------------------------------------- %macro ADDAVG_W4_H4 1 INIT_XMM sse4 cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/4 .loop: %rep 2 movh m0, [r0] movh m1, [r0 + r3] movh m2, [r1] movh m3, [r1 + r4] 
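    ; fold the two 4-sample rows of each source into a single register so one add/scale/clip pass covers both rows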
punpcklqdq m0, m1 punpcklqdq m2, m3 paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movh [r2], m0 movhps [r2 + r5], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %endmacro ADDAVG_W4_H4 4 ADDAVG_W4_H4 8 ADDAVG_W4_H4 16 ADDAVG_W4_H4 32 ;----------------------------------------------------------------------------- %macro ADDAVG_W8_H4 1 INIT_XMM sse4 cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/4 .loop: %rep 2 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %endmacro ADDAVG_W8_H4 4 ADDAVG_W8_H4 8 ADDAVG_W8_H4 16 ADDAVG_W8_H4 32 ADDAVG_W8_H4 12 ADDAVG_W8_H4 64 ;----------------------------------------------------------------------------- %macro ADDAVG_W12_H4 1 INIT_XMM sse4 cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/4 .loop: %rep 2 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movh m0, [r0 + 16] movh m1, [r0 + 16 + r3] movh m2, [r1 + 16] movh m3, [r1 + 16 + r4] punpcklqdq m0, m1 punpcklqdq m2, m3 paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movh [r2 + 16], m0 movhps [r2 + r5 + 16], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %endmacro ADDAVG_W12_H4 16 ADDAVG_W12_H4 32 ;----------------------------------------------------------------------------- %macro ADDAVG_W16_H4 1 INIT_XMM sse4 cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/4 .loop: %rep 2 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 16], m1 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + r5 + 16], m2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %endmacro ADDAVG_W16_H4 4 ADDAVG_W16_H4 8 ADDAVG_W16_H4 12 ADDAVG_W16_H4 16 ADDAVG_W16_H4 32 ADDAVG_W16_H4 64 ADDAVG_W16_H4 24 ;----------------------------------------------------------------------------- %macro ADDAVG_W24_H2 2 INIT_XMM sse4 cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %2/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 
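    ; m7 = ADDAVG_FACTOR: pmulhrsw rescales the 16-bit sum, ADDAVG_ROUND (m4) is added, then the result is clamped to [0, pw_pixel_max]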
pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 16], m1 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 32], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 movu m2, [r0 + r3 + 16] movu m3, [r1 + r4 + 16] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + r5 + 16], m2 movu m1, [r0 + r3 + 32] movu m3, [r1 + r4 + 32] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5 + 32], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W24_H2 24, 32 ADDAVG_W24_H2 24, 64 ;----------------------------------------------------------------------------- %macro ADDAVG_W32_H2 1 INIT_XMM sse4 cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 16], m1 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 32], m0 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 48], m1 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + r5 + 16], m2 movu m1, [r0 + 32 + r3] movu m3, [r1 + 32 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5 + 32], m1 movu m2, [r0 + 48 + r3] movu m3, [r1 + 48 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + r5 + 48], m2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W32_H2 8 ADDAVG_W32_H2 16 ADDAVG_W32_H2 24 ADDAVG_W32_H2 32 ADDAVG_W32_H2 64 ADDAVG_W32_H2 48 ;----------------------------------------------------------------------------- %macro ADDAVG_W48_H2 1 INIT_XMM sse4 cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 16], m1 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 32], m0 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 48], m1 movu m0, [r0 + 64] movu m2, [r1 + 64] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 64], m0 movu m1, [r0 + 80] movu m2, [r1 + 80] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu 
[r2 + 80], m1 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + r5], m1 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + 16 + r5], m2 movu m1, [r0 + 32 + r3] movu m3, [r1 + 32 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 32 + r5], m1 movu m2, [r0 + 48 + r3] movu m3, [r1 + 48 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + 48 + r5], m2 movu m1, [r0 + 64 + r3] movu m3, [r1 + 64 + r4] paddw m1, m3 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 64 + r5], m1 movu m2, [r0 + 80 + r3] movu m3, [r1 + 80 + r4] paddw m2, m3 pmulhrsw m2, m7 paddw m2, m4 pmaxsw m2, m6 pminsw m2, m5 movu [r2 + 80 + r5], m2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W48_H2 64 ;----------------------------------------------------------------------------- %macro ADDAVG_W64_H1 1 INIT_XMM sse4 cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m7, [pw_ %+ ADDAVG_FACTOR] pxor m6, m6 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2], m0 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 16], m1 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 32], m0 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 48], m1 movu m0, [r0 + 64] movu m2, [r1 + 64] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 64], m0 movu m1, [r0 + 80] movu m2, [r1 + 80] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 80], m1 movu m0, [r0 + 96] movu m2, [r1 + 96] paddw m0, m2 pmulhrsw m0, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m5 movu [r2 + 96], m0 movu m1, [r0 + 112] movu m2, [r1 + 112] paddw m1, m2 pmulhrsw m1, m7 paddw m1, m4 pmaxsw m1, m6 pminsw m1, m5 movu [r2 + 112], m1 add r2, r5 add r0, r3 add r1, r4 dec r6d jnz .loop RET %endmacro ADDAVG_W64_H1 16 ADDAVG_W64_H1 32 ADDAVG_W64_H1 48 ADDAVG_W64_H1 64 ;------------------------------------------------------------------------------ ; avx2 asm for addAvg high_bit_depth ;------------------------------------------------------------------------------ INIT_YMM avx2 cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride movu xm0, [r0] vinserti128 m0, m0, [r0 + r3 * 2], 1 movu xm1, [r1] vinserti128 m1, m1, [r1 + r4 * 2], 1 paddw m0, m1 pxor m1, m1 pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR] paddw m0, [pw_ %+ ADDAVG_ROUND] pmaxsw m0, m1 pminsw m0, [pw_pixel_max] vextracti128 xm1, m0, 1 movu [r2], xm0 movu [r2 + r5 * 2], xm1 RET cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m1, m1 add r3d, r3d add r4d, r4d add r5d, r5d movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movu [r2 + r5], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu 
xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movu [r2 + r5], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movu [r2 + r5], xm2 RET %macro ADDAVG_W8_H4_AVX2 1 cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m1, m1 add r3d, r3d add r4d, r4d add r5d, r5d mov r6d, %1/4 .loop: movu m0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu m2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movu [r2 + r5], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu m2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movu [r2 + r5], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W8_H4_AVX2 4 ADDAVG_W8_H4_AVX2 8 ADDAVG_W8_H4_AVX2 12 ADDAVG_W8_H4_AVX2 16 ADDAVG_W8_H4_AVX2 32 ADDAVG_W8_H4_AVX2 64 cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m1, m1 add r3, r3 add r4, r4 add r5, r5 mov r6d, 4 .loop: %rep 2 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movq [r2 + 16], xm2 movu m0, [r0 + r3] movu m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2 + r5], xm0 movq [r2 + r5 + 16], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] paddw m3, m4, m4 pxor m1, m1 add r3, r3 add r4, r4 add r5, r5 mov r6d, 8 .loop: %rep 2 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2], xm0 movq [r2 + 16], xm2 movu m0, [r0 + r3] movu m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 vextracti128 xm2, m0, 1 movu [r2 + r5], xm0 movq [r2 + r5 + 16], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %macro ADDAVG_W16_H4_AVX2 1 cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m2, m2 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/4 .loop: %rep 2 movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] %endrep dec r6d jnz .loop RET %endmacro ADDAVG_W16_H4_AVX2 4 ADDAVG_W16_H4_AVX2 8 ADDAVG_W16_H4_AVX2 12 ADDAVG_W16_H4_AVX2 16 ADDAVG_W16_H4_AVX2 24 
ADDAVG_W16_H4_AVX2 32 ADDAVG_W16_H4_AVX2 64 cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m1, m1 add r3, r3 add r4, r4 add r5, r5 mov r6d, 16 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 movu [r2], m0 movu xm0, [r0 + 32] movu xm2, [r1 + 32] paddw xm0, xm2 pmulhrsw xm0, xm3 paddw xm0, xm4 pmaxsw xm0, xm1 pminsw xm0, xm5 movu [r2 + 32], xm0 movu m0, [r0 + r3] movu m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 movu [r2 + r5], m0 movu xm2, [r0 + r3 + 32] movu xm0, [r1 + r4 + 32] paddw xm2, xm0 pmulhrsw xm2, xm3 paddw xm2, xm4 pmaxsw xm2, xm1 pminsw xm2, xm5 movu [r2 + r5 + 32], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] paddw m3, m4, m4 pxor m1, m1 add r3, r3 add r4, r4 add r5, r5 mov r6d, 32 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 movu [r2], m0 movu xm0, [r0 + 32] movu xm2, [r1 + 32] paddw xm0, xm2 pmulhrsw xm0, xm3 paddw xm0, xm4 pmaxsw xm0, xm1 pminsw xm0, xm5 movu [r2 + 32], xm0 movu m0, [r0 + r3] movu m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m1 pminsw m0, m5 movu [r2 + r5], m0 movu xm2, [r0 + r3 + 32] movu xm0, [r1 + r4 + 32] paddw xm2, xm0 pmulhrsw xm2, xm3 paddw xm2, xm4 pmaxsw xm2, xm1 pminsw xm2, xm5 movu [r2 + r5 + 32], xm2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %macro ADDAVG_W32_H2_AVX2 1 cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m2, m2 add r3, r3 add r4, r4 add r5, r5 mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2], m0 movu m0, [r0 + 32] movu m1, [r1 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 32], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5], m0 movu m0, [r0 + r3 + 32] movu m1, [r1 + r4 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5 + 32], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W32_H2_AVX2 8 ADDAVG_W32_H2_AVX2 16 ADDAVG_W32_H2_AVX2 24 ADDAVG_W32_H2_AVX2 32 ADDAVG_W32_H2_AVX2 48 ADDAVG_W32_H2_AVX2 64 cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m2, m2 add r3, r3 add r4, r4 add r5, r5 mov r6d, 32 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2], m0 movu m0, [r0 + 32] movu m1, [r1 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 32], m0 movu m0, [r0 + 64] movu m1, [r1 + 64] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 64], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5], m0 movu m0, [r0 + r3 + 32] movu m1, [r1 + r4 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw 
m0, m2 pminsw m0, m5 movu [r2 + r5 + 32], m0 movu m0, [r0 + r3 + 64] movu m1, [r1 + r4 + 64] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5 + 64], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %macro ADDAVG_W64_H1_AVX2 1 cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride mova m4, [pw_ %+ ADDAVG_ROUND] mova m5, [pw_pixel_max] mova m3, [pw_ %+ ADDAVG_FACTOR] pxor m2, m2 add r3d, r3d add r4d, r4d add r5d, r5d mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2], m0 movu m0, [r0 + 32] movu m1, [r1 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 32], m0 movu m0, [r0 + 64] movu m1, [r1 + 64] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 64], m0 movu m0, [r0 + 96] movu m1, [r1 + 96] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + 96], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5], m0 movu m0, [r0 + r3 + 32] movu m1, [r1 + r4 + 32] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5 + 32], m0 movu m0, [r0 + r3 + 64] movu m1, [r1 + r4 + 64] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5 + 64], m0 movu m0, [r0 + r3 + 96] movu m1, [r1 + r4 + 96] paddw m0, m1 pmulhrsw m0, m3 paddw m0, m4 pmaxsw m0, m2 pminsw m0, m5 movu [r2 + r5 + 96], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W64_H1_AVX2 16 ADDAVG_W64_H1_AVX2 32 ADDAVG_W64_H1_AVX2 48 ADDAVG_W64_H1_AVX2 64 ;----------------------------------------------------------------------------- %else ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1tride, dstStride mova m0, [pw_256] mova m7, [pw_128] add r3, r3 add r4, r4 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] punpckldq m1, m2 punpckldq m3, m4 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m6, [r1 + r4] punpckldq m2, m4 punpckldq m5, m6 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, m0 paddw m1, m7 packuswb m1, m1 pextrw [r2], m1, 0 pextrw [r2 + r5], m1, 1 lea r2, [r2 + 2 * r5] pextrw [r2], m1, 2 pextrw [r2 + r5], m1, 3 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_2x8, 6,6,8, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m0, [pw_256] mova m7, [pw_128] add r3, r3 add r4, r4 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] punpckldq m1, m2 punpckldq m3, m4 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m6, [r1 + r4] punpckldq m2, m4 punpckldq m5, m6 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, m0 paddw m1, m7 packuswb m1, m1 pextrw [r2], m1, 0 pextrw [r2 + r5], m1, 1 lea r2, [r2 + 2 * r5] pextrw [r2], m1, 2 pextrw [r2 + r5], m1, 3 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] punpckldq m1, m2 punpckldq m3, m4 lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movd m2, [r0] movd m4, [r0 + 
r3] movd m5, [r1] movd m6, [r1 + r4] punpckldq m2, m4 punpckldq m5, m6 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, m0 paddw m1, m7 packuswb m1, m1 pextrw [r2], m1, 0 pextrw [r2 + r5], m1, 1 lea r2, [r2 + 2 * r5] pextrw [r2], m1, 2 pextrw [r2 + r5], m1, 3 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1tride, dstStride mova m0, [pw_256] mova m7, [pw_128] mov r6d, 16/4 add r3, r3 add r4, r4 .loop: movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r4] lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] punpckldq m1, m2 punpckldq m3, m4 movd m2, [r0] movd m4, [r0 + r3] movd m5, [r1] movd m6, [r1 + r4] lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] punpckldq m2, m4 punpckldq m5, m6 punpcklqdq m1, m2 punpcklqdq m3, m5 paddw m1, m3 pmulhrsw m1, m0 paddw m1, m7 packuswb m1, m1 pextrw [r2], m1, 0 pextrw [r2 + r5], m1, 1 lea r2, [r2 + r5 * 2] pextrw [r2], m1, 2 pextrw [r2 + r5], m1, 3 lea r2, [r2 + r5 * 2] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_4x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m1, [pw_256] mova m3, [pw_128] add r3, r3 add r4, r4 movh m0, [r0] movhps m0, [r0 + r3] movh m2, [r1] movhps m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m1 paddw m0, m3 packuswb m0, m0 movd [r2], m0 pshufd m0, m0, 1 movd [r2 + r5], m0 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W4_H4 1 INIT_XMM sse4 cglobal addAvg_4x%1, 6,7,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m1, [pw_256] mova m3, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movh m0, [r0] movhps m0, [r0 + r3] movh m2, [r1] movhps m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m1 paddw m0, m3 packuswb m0, m0 movd [r2], m0 pshufd m0, m0, 1 movd [r2 + r5], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movh m0, [r0] movhps m0, [r0 + r3] movh m2, [r1] movhps m2, [r1 + r4] paddw m0, m2 pmulhrsw m0, m1 paddw m0, m3 packuswb m0, m0 movd [r2], m0 pshufd m0, m0, 1 movd [r2 + r5], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W4_H4 4 ADDAVG_W4_H4 8 ADDAVG_W4_H4 16 ADDAVG_W4_H4 32 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_6x8, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2], m0 pextrw [r2 + 4], m0, 2 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movd [r2 + r5], m1 pextrw [r2 + r5 + 4], m1, 2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2], m0 pextrw [r2 + 4], m0, 2 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movd [r2 + r5], m1 pextrw [r2 + r5 + 4], m1, 2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] 
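    ; rows 4-7 repeat the same pattern: 16-bit average, pack to bytes, then a 4-byte movd plus a 2-byte pextrw per 6-pixel row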
movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2], m0 pextrw [r2 + 4], m0, 2 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movd [r2 + r5], m1 pextrw [r2 + r5 + 4], m1, 2 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2], m0 pextrw [r2 + 4], m0, 2 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movd [r2 + r5], m1 pextrw [r2 + r5 + 4], m1, 2 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_6x16, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] mov r6d, 16/2 add r3, r3 add r4, r4 .loop: movu m0, [r0] movu m2, [r1] movu m1, [r0 + r3] movu m3, [r1 + r4] dec r6d lea r0, [r0 + r3 * 2] lea r1, [r1 + r4 * 2] paddw m0, m2 paddw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 packuswb m0, m0 packuswb m1, m1 movd [r2], m0 pextrw [r2 + 4], m0, 2 movd [r2 + r5], m1 pextrw [r2 + r5 + 4], m1, 2 lea r2, [r2 + r5 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_8x2, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal addAvg_8x6, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 RET ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W8_H4 1 INIT_XMM sse4 cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * 
r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W8_H4 4 ADDAVG_W8_H4 8 ADDAVG_W8_H4 16 ADDAVG_W8_H4 32 ADDAVG_W8_H4 12 ADDAVG_W8_H4 64 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W12_H4 1 INIT_XMM sse4 cglobal addAvg_12x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movh m0, [r0 + 16] movhps m0, [r0 + 16 + r3] movh m2, [r1 + 16] movhps m2, [r1 + 16 + r4] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2 + 8], m0 pshufd m0, m0, 1 movd [r2 + 8 + r5], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2], m0 movh m0, [r0 + 16] movhps m0, [r0 + 16 + r3] movh m2, [r1 + 16] movhps m2, [r1 + 16 + r4] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movd [r2 + 8], m0 pshufd m0, m0, 1 movd [r2 + 8 + r5], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W12_H4 16 ADDAVG_W12_H4 32 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W16_H4 1 INIT_XMM sse4 cglobal addAvg_16x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W16_H4 4 ADDAVG_W16_H4 8 ADDAVG_W16_H4 12 ADDAVG_W16_H4 16 ADDAVG_W16_H4 32 ADDAVG_W16_H4 64 ADDAVG_W16_H4 24 ;----------------------------------------------------------------------------- ; addAvg avx2 code start ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal addAvg_8x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride movu xm0, [r0] vinserti128 m0, m0, [r0 + 2 * r3], 1 
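    ; both rows of src0 now sit in one ymm register; src1 is gathered the same way before the joint average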
movu xm2, [r1] vinserti128 m2, m2, [r1 + 2 * r4], 1 paddw m0, m2 pmulhrsw m0, [pw_256] paddw m0, [pw_128] packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 RET cglobal addAvg_8x6, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu xm0, [r0] vinserti128 m0, m0, [r0+ r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 RET %macro ADDAVG_W8_H4_AVX2 1 INIT_YMM avx2 cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu xm2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu xm0, [r0] vinserti128 m0, m0, [r0 + r3], 1 movu m2, [r1] vinserti128 m2, m2, [r1 + r4], 1 paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vextracti128 xm1, m0, 1 movq [r2], xm0 movq [r2 + r5], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W8_H4_AVX2 4 ADDAVG_W8_H4_AVX2 8 ADDAVG_W8_H4_AVX2 12 ADDAVG_W8_H4_AVX2 16 ADDAVG_W8_H4_AVX2 32 ADDAVG_W8_H4_AVX2 64 %macro ADDAVG_W12_H4_AVX2 1 INIT_YMM avx2 cglobal addAvg_12x%1, 6,7,7, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu xm0, [r0] movu xm1, [r1] movq xm2, [r0 + 16] movq xm3, [r1 + 16] vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu xm1, [r0 + r3] movu xm2, [r1 + r4] movq xm3, [r0 + r3 + 16] movq xm6, [r1 + r3 + 16] vinserti128 m1, m1, xm3, 1 vinserti128 m2, m2, xm6, 1 paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [r2], xm0 movd [r2 + 8], xm1 vpshufd m1, m1, 2 movhps [r2 + r5], xm0 movd [r2 + r5 + 8], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu xm0, [r0] movu xm1, [r1] movq xm2, [r0 + 16] movq xm3, [r1 + 16] vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu xm1, [r0 + r3] movu xm2, [r1 + r4] movq xm3, [r0 + r3 + 16] movq xm6, [r1 + r3 + 16] vinserti128 m1, m1, xm3, 1 vinserti128 m2, m2, xm6, 1 paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [r2], xm0 movd [r2 + 8], xm1 vpshufd m1, m1, 2 movhps [r2 + r5], xm0 movd [r2 + r5 + 8], xm1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W12_H4_AVX2 16 ADDAVG_W12_H4_AVX2 32 %macro ADDAVG_W16_H4_AVX2 1 INIT_YMM avx2 cglobal addAvg_16x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, 
[pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/4 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + r3] movu m2, [r1 + r4] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b vextracti128 [r2], m0, 0 vextracti128 [r2 + r5], m0, 1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + r3] movu m2, [r1 + r4] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b vextracti128 [r2], m0, 0 vextracti128 [r2 + r5], m0, 1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W16_H4_AVX2 4 ADDAVG_W16_H4_AVX2 8 ADDAVG_W16_H4_AVX2 12 ADDAVG_W16_H4_AVX2 16 ADDAVG_W16_H4_AVX2 24 ADDAVG_W16_H4_AVX2 32 ADDAVG_W16_H4_AVX2 64 %macro ADDAVG_W24_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_24x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu xm1, [r0 + 32] movu xm2, [r1 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 10001101b vextracti128 [r2], m0, 1 movq [r2 + 16], xm0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu xm1, [r0 + r3 + 32] movu xm2, [r1 + r4 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 10001101b vextracti128 [r2 + r5], m0, 1 movq [r2 + r5 + 16], xm0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W24_H2_AVX2 32 ADDAVG_W24_H2_AVX2 64 %macro ADDAVG_W32_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_32x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 32] movu m2, [r1 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + r3 + 32] movu m2, [r1 + r4 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2 + r5], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W32_H2_AVX2 8 ADDAVG_W32_H2_AVX2 16 ADDAVG_W32_H2_AVX2 24 ADDAVG_W32_H2_AVX2 32 ADDAVG_W32_H2_AVX2 48 ADDAVG_W32_H2_AVX2 64 %macro ADDAVG_W64_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 32] movu m2, [r1 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2], m0 movu m0, [r0 + 64] movu m1, [r1 + 64] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 96] movu m2, [r1 + 96] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2 + 32], m0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + r3 + 32] movu m2, [r1 + r4 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2 + r5], m0 movu m0, [r0 + r3 + 64] movu m1, [r1 + r4 + 64] paddw m0, m1 pmulhrsw m0, m4 paddw m0, 
m5 movu m1, [r0 + r3 + 96] movu m2, [r1 + r4 + 96] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2 + r5 + 32], m0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W64_H2_AVX2 16 ADDAVG_W64_H2_AVX2 32 ADDAVG_W64_H2_AVX2 48 ADDAVG_W64_H2_AVX2 64 %macro ADDAVG_W48_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m1, [r1] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 32] movu m2, [r1 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2], m0 movu m0, [r0 + 64] movu m1, [r1 + 64] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vpermq m0, m0, 11011000b vextracti128 [r2 + 32], m0, 0 movu m0, [r0 + r3] movu m1, [r1 + r4] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + r3 + 32] movu m2, [r1 + r4 + 32] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r2 + r5], m0 movu m0, [r0 + r3 + 64] movu m1, [r1 + r4 + 64] paddw m0, m1 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 vpermq m0, m0, 11011000b vextracti128 [r2 + r5 + 32], m0, 0 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W48_H2_AVX2 64 ;----------------------------------------------------------------------------- ; addAvg avx2 code end ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W24_H2 2 INIT_XMM sse4 cglobal addAvg_%1x%2, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %2/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 packuswb m0, m0 movh [r2 + 16], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + r5], m1 movu m1, [r0 + 32 + r3] movu m3, [r1 + 32 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 packuswb m1, m1 movh [r2 + 16 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W24_H2 24, 32 ADDAVG_W24_H2 24, 64 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W32_H2 1 INIT_XMM sse4 cglobal addAvg_32x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 16], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw 
m2, m5 packuswb m1, m2 movu [r2 + r5], m1 movu m1, [r0 + 32 + r3] movu m3, [r1 + 32 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 48 + r3] movu m3, [r1 + 48 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + 16 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W32_H2 8 ADDAVG_W32_H2 16 ADDAVG_W32_H2 24 ADDAVG_W32_H2 32 ADDAVG_W32_H2 64 ADDAVG_W32_H2 48 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W48_H2 1 INIT_XMM sse4 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1/2 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 16], m0 movu m0, [r0 + 64] movu m2, [r1 + 64] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 80] movu m2, [r1 + 80] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 32], m0 movu m1, [r0 + r3] movu m3, [r1 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 16 + r3] movu m3, [r1 + 16 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + r5], m1 movu m1, [r0 + 32 + r3] movu m3, [r1 + 32 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 48 + r3] movu m3, [r1 + 48 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + 16 + r5], m1 movu m1, [r0 + 64 + r3] movu m3, [r1 + 64 + r4] paddw m1, m3 pmulhrsw m1, m4 paddw m1, m5 movu m2, [r0 + 80 + r3] movu m3, [r1 + 80 + r4] paddw m2, m3 pmulhrsw m2, m4 paddw m2, m5 packuswb m1, m2 movu [r2 + 32 + r5], m1 lea r2, [r2 + 2 * r5] lea r0, [r0 + 2 * r3] lea r1, [r1 + 2 * r4] dec r6d jnz .loop RET %endmacro ADDAVG_W48_H2 64 ;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W64_H1 1 INIT_XMM sse4 cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride mova m4, [pw_256] mova m5, [pw_128] add r3, r3 add r4, r4 mov r6d, %1 .loop: movu m0, [r0] movu m2, [r1] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 16] movu m2, [r1 + 16] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2], m0 movu m0, [r0 + 32] movu m2, [r1 + 32] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 48] movu m2, [r1 + 48] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 16], m0 movu m0, [r0 + 64] movu m2, [r1 + 64] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 80] movu m2, [r1 + 80] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 32], m0 movu m0, [r0 + 96] movu m2, [r1 + 96] paddw m0, m2 pmulhrsw m0, m4 paddw m0, m5 movu m1, [r0 + 112] movu m2, [r1 + 112] paddw m1, m2 pmulhrsw m1, m4 paddw m1, m5 packuswb m0, m1 movu [r2 + 48], m0 add r2, r5 add r0, r3 add r1, r4 dec r6d jnz .loop RET %endmacro ADDAVG_W64_H1 16 ADDAVG_W64_H1 32 ADDAVG_W64_H1 48 ADDAVG_W64_H1 64 ;----------------------------------------------------------------------------- %endif ; HIGH_BIT_DEPTH 
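;-----------------------------------------------------------------------------
; Reference sketch (editor's note, hedged; not part of the original sources):
; the 8-bit addAvg kernels above average two int16_t prediction blocks and
; convert the result back to pixels.  pmulhrsw against pw_256 computes
; (x + 64) >> 7 with rounding, the extra paddw pw_128 removes the
; intermediate-precision bias, and packuswb clamps to [0,255].  Assuming the
; x265-style 14-bit intermediate precision (offset 8192 per source block),
; a scalar equivalent would look roughly like:
;
;   #include <stdint.h>
;   #include <stddef.h>
;
;   static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }
;
;   /* hypothetical reference; strides are in elements of each buffer */
;   static void addAvg_ref(const int16_t *s0, const int16_t *s1, uint8_t *dst,
;                          ptrdiff_t str0, ptrdiff_t str1, ptrdiff_t strd,
;                          int w, int h)
;   {
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < w; x++)
;               dst[x] = clip_u8(((s0[x] + s1[x] + 64) >> 7) + 128);
;           s0 += str0; s1 += str1; dst += strd;
;       }
;   }
;-----------------------------------------------------------------------------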
;============================================================================= ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 %if WIN64 DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 PROLOGUE 6,7,%1 %endmacro %elif UNIX64 DECLARE_REG_TMP 0,1,2,3,4,5,7,8 %macro AVG_START 0-1 0 PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 %macro AVG_START 0-1 0 PROLOGUE 0,7,%1 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m mov t4, r4m mov t5, r5m %endmacro %endif %macro AVG_END 0 lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] sub eax, 2 jg .height_loop %ifidn movu,movq ; detect MMX EMMS %endif RET %endmacro %if HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 punpcklwd m0, m1 pmaddwd m0, m3 paddd m0, m4 psrad m0, 6 %endmacro %macro BIWEIGHT_START_MMX 0 movzx t6d, word r6m mov t7d, 64 sub t7d, t6d shl t7d, 16 add t6d, t7d movd m3, t6d SPLATD m3, m3 mova m4, [pd_32] pxor m5, m5 %endmacro %else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, m2 pmullw m1, m3 paddw m0, m1 paddw m0, m4 psraw m0, 6 %endmacro %macro BIWEIGHT_START_MMX 0 movd m2, r6m SPLATW m2, m2 ; weight_dst mova m3, [pw_64] psubw m3, m2 ; weight_src mova m4, [pw_32] ; rounding pxor m5, m5 %endmacro %endif ;HIGH_BIT_DEPTH %macro BIWEIGHT_SSSE3 2 movh m0, %1 movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 movzx t6d, byte r6m ; FIXME x86_64 mov t7d, 64 sub t7d, t6d shl t7d, 8 add t6d, t7d mova m4, [pw_512] movd xm3, t6d %if cpuflag(avx2) vpbroadcastw m3, xm3 %else SPLATW m3, m3 ; weight_dst,src %endif %endmacro %if HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/4 packssdw m0, m0 CLIPW m0, m5, m7 movh [%1], m0 %else SWAP 0, 6 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] packssdw m6, m0 CLIPW m6, m5, m7 mova [%1], m6 %endif %endmacro %else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/2 packuswb m0, m0 movh [%1], m0 %else SWAP 0, 6 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] packuswb m6, m0 %if %4 != 12 mova [%1], m6 %else ; !w12 movh [%1], m6 movhlps m6, m6 movd [%1+mmsize/2], m6 %endif ; w12 %endif %endmacro %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 1-2 0 cglobal pixel_avg_weight_w%1 BIWEIGHT_START AVG_START %2 %if HIGH_BIT_DEPTH mova m7, [pw_pixel_max] %endif .height_loop: %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) BIWEIGHT [t2], [t4] SWAP 0, 6 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] %if HIGH_BIT_DEPTH packssdw m6, m0 CLIPW m6, m5, m7 %else ;!HIGH_BIT_DEPTH packuswb m6, m0 %endif ;HIGH_BIT_DEPTH movlps [t0], m6 movhps [t0+SIZEOF_PIXEL*t1], m6 %else %assign x 0 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize %assign y mmsize %if (%1 == 12) && (%1*SIZEOF_PIXEL-x < mmsize) %assign y (%1*SIZEOF_PIXEL-x) %endif BIWEIGHT_ROW t0+x, t2+x, t4+x, y BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, y %assign x x+mmsize %endrep %endif AVG_END %endmacro %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX INIT_MMX mmx2 AVG_WEIGHT 4 AVG_WEIGHT 8 AVG_WEIGHT 12 AVG_WEIGHT 16 AVG_WEIGHT 32 
AVG_WEIGHT 64 AVG_WEIGHT 24 AVG_WEIGHT 48 %if HIGH_BIT_DEPTH INIT_XMM sse2 AVG_WEIGHT 4, 8 AVG_WEIGHT 8, 8 AVG_WEIGHT 12, 8 AVG_WEIGHT 16, 8 AVG_WEIGHT 24, 8 AVG_WEIGHT 32, 8 AVG_WEIGHT 48, 8 AVG_WEIGHT 64, 8 %else ;!HIGH_BIT_DEPTH INIT_XMM sse2 AVG_WEIGHT 8, 7 AVG_WEIGHT 12, 7 AVG_WEIGHT 16, 7 AVG_WEIGHT 32, 7 AVG_WEIGHT 64, 7 AVG_WEIGHT 24, 7 AVG_WEIGHT 48, 7 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 INIT_MMX ssse3 AVG_WEIGHT 4 INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 12, 7 AVG_WEIGHT 16, 7 AVG_WEIGHT 32, 7 AVG_WEIGHT 64, 7 AVG_WEIGHT 24, 7 AVG_WEIGHT 48, 7 INIT_YMM avx2 cglobal pixel_avg_weight_w16 BIWEIGHT_START AVG_START 5 .height_loop: movu xm0, [t2] movu xm1, [t4] vinserti128 m0, m0, [t2+t3], 1 vinserti128 m1, m1, [t4+t5], 1 SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0], xm0 vextracti128 [t0+t1], m0, 1 AVG_END cglobal pixel_avg_weight_w32 BIWEIGHT_START AVG_START 5 .height_loop: movu m0, [t2] movu m1, [t4] SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0], m0 AVG_END cglobal pixel_avg_weight_w64 BIWEIGHT_START AVG_START 5 .height_loop: movu m0, [t2] movu m1, [t4] SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0], m0 movu m0, [t2 + 32] movu m1, [t4 + 32] SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0 + 32], m0 AVG_END %endif ;HIGH_BIT_DEPTH ;============================================================================= ; P frame explicit weighted prediction ;============================================================================= %if HIGH_BIT_DEPTH ; width %macro WEIGHT_START 1 mova m0, [r4+ 0] ; 1<= mmsize WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else %assign w %3-x %if w == 20 %assign w 16 %endif WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 %assign x (x+w) %endif %if x >= %3 %exitrep %endif %endrep %endmacro %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- %macro WEIGHTER 1 cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 ; we can merge the shift step into the scale factor ; if (m3<<7) doesn't overflow an int16_t cmp byte [r4+1], 0 jz .fast %endif .loop: WEIGHT_TWO_ROW r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 .fast: psllw m3, 7 .fastloop: WEIGHT_TWO_ROW r2, r0, %1, 1 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .fastloop RET %endif %endmacro INIT_MMX mmx2 WEIGHTER 4 WEIGHTER 8 WEIGHTER 12 WEIGHTER 16 WEIGHTER 20 INIT_XMM sse2 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 %if HIGH_BIT_DEPTH WEIGHTER 12 %else INIT_MMX ssse3 WEIGHTER 4 INIT_XMM ssse3 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 INIT_YMM avx2 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 %endif %macro OFFSET_OP 7 mov%6 m0, [%1] mov%6 m1, [%2] %if HIGH_BIT_DEPTH p%5usw m0, m2 p%5usw m1, m2 %ifidn %5,add pminsw m0, m3 pminsw m1, m3 %endif %else p%5usb m0, m2 p%5usb m1, m2 %endif mov%7 [%3], m0 mov%7 [%4], m1 %endmacro %macro OFFSET_TWO_ROW 4 %assign x 0 %rep %3 %if (%3*SIZEOF_PIXEL-x) >= mmsize OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, 
u, a %assign x (x+mmsize) %else %if HIGH_BIT_DEPTH OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h %else OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d %endif %exitrep %endif %if x >= %3*SIZEOF_PIXEL %exitrep %endif %endrep %endmacro ;----------------------------------------------------------------------------- ;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- %macro OFFSET 2 cglobal mc_offset%2_w%1, 6,6 FIX_STRIDES r1, r3 mova m2, [r4] %if HIGH_BIT_DEPTH %ifidn %2,add mova m3, [pw_pixel_max] %endif %endif .loop: OFFSET_TWO_ROW r2, r0, %1, %2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET %endmacro %macro OFFSETPN 1 OFFSET %1, add OFFSET %1, sub %endmacro INIT_MMX mmx2 OFFSETPN 4 OFFSETPN 8 OFFSETPN 12 OFFSETPN 16 OFFSETPN 20 INIT_XMM sse2 OFFSETPN 12 OFFSETPN 16 OFFSETPN 20 %if HIGH_BIT_DEPTH INIT_XMM sse2 OFFSETPN 8 %endif ;============================================================================= ; pixel avg ;============================================================================= ;----------------------------------------------------------------------------- ; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, ; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 2 cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 jne pixel_avg_weight_w%1 %+ SUFFIX %if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads jmp pixel_avg_w%1_avx2 %else %if mmsize == 16 && (%1 % 16 == 0) test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif %if (%1 == 8) jmp pixel_avg_w8_unaligned_sse2 %else jmp pixel_avg_w%1_mmx2 %endif %endif %endmacro ;----------------------------------------------------------------------------- ; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, ; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_FUNC 3-4 cglobal pixel_avg_w%1 AVG_START .height_loop: %assign x 0 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize %2 m0, [t2+x] %2 m1, [t2+x+SIZEOF_PIXEL*t3] %if HIGH_BIT_DEPTH pavgw m0, [t4+x] pavgw m1, [t4+x+SIZEOF_PIXEL*t5] %else ;!HIGH_BIT_DEPTH pavgb m0, [t4+x] pavgb m1, [t4+x+SIZEOF_PIXEL*t5] %endif %if (%1 == 12) && (%1-x/SIZEOF_PIXEL < mmsize) %4 [t0+x], m0 %4 [t0+x+SIZEOF_PIXEL*t1], m1 %else %3 [t0+x], m0 %3 [t0+x+SIZEOF_PIXEL*t1], m1 %endif %assign x x+mmsize %endrep AVG_END %endmacro %macro pixel_avg_W8 0 movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] %endmacro INIT_XMM sse2 cglobal pixel_avg_w8_unaligned AVG_START .height_loop: %if HIGH_BIT_DEPTH ; NO TEST BRANCH! 
movu m0, [t2] movu m1, [t2+SIZEOF_PIXEL*t3] movu m2, [t4] movu m3, [t4+SIZEOF_PIXEL*t5] pavgw m0, m2 pavgw m1, m3 movu [t0], m0 movu [t0+SIZEOF_PIXEL*t1], m1 %else ;!HIGH_BIT_DEPTH movq m0, [t2] movhps m0, [t2+SIZEOF_PIXEL*t3] movq m1, [t4] movhps m1, [t4+SIZEOF_PIXEL*t5] pavgb m0, m1 movq [t0], m0 movhps [t0+SIZEOF_PIXEL*t1], m0 %endif AVG_END ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_XMM sse2 cglobal pixel_avg_8x4, 6,9,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] pixel_avg_W8 RET cglobal pixel_avg_8x8, 6,9,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] pixel_avg_W8 pixel_avg_W8 RET cglobal pixel_avg_8x16, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_W8 dec r9d jnz .loop RET cglobal pixel_avg_8x32, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 .loop: pixel_avg_W8 dec r9d jnz .loop RET %endif %endif %if HIGH_BIT_DEPTH INIT_MMX mmx2 AVG_FUNC 4, movq, movq AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 ;AVG_FUNC 8, movq, movq ;AVGH 8, 32 ;AVGH 8, 16 ;AVGH 8, 8 ;AVGH 8, 4 AVG_FUNC 16, movq, movq AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 AVGH 16, 4 AVG_FUNC 24, movq, movq AVGH 24, 32 AVG_FUNC 32, movq, movq AVGH 32, 32 AVGH 32, 24 AVGH 32, 16 AVGH 32, 8 AVG_FUNC 48, movq, movq AVGH 48, 64 AVG_FUNC 64, movq, movq AVGH 64, 64 AVGH 64, 48 AVGH 64, 32 AVGH 64, 16 AVG_FUNC 12, movq, movq, movq AVGH 12, 16 INIT_XMM sse2 AVG_FUNC 4, movq, movq AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 AVG_FUNC 16, movdqu, movdqa AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 AVGH 16, 4 AVG_FUNC 24, movdqu, movdqa AVGH 24, 32 AVG_FUNC 32, movdqu, movdqa AVGH 32, 64 AVGH 32, 32 AVGH 32, 24 AVGH 32, 16 AVGH 32, 8 AVG_FUNC 48, movdqu, movdqa AVGH 48, 64 AVG_FUNC 64, movdqu, movdqa AVGH 64, 64 AVGH 64, 48 AVGH 64, 32 AVGH 64, 16 AVG_FUNC 12, movdqu, movdqa, movq AVGH 12, 16 %else ;!HIGH_BIT_DEPTH INIT_MMX mmx2 AVG_FUNC 4, movd, movd AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 ;AVG_FUNC 8, movq, movq AVGH 8, 32 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 AVG_FUNC 12, movq, movq, movd AVGH 12, 16 AVG_FUNC 16, movq, movq AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 AVGH 16, 4 AVG_FUNC 32, movq, movq AVGH 32, 32 AVGH 32, 24 AVGH 32, 16 AVGH 32, 8 AVG_FUNC 64, movq, movq AVGH 64, 64 AVGH 64, 48 AVGH 64, 16 AVG_FUNC 24, movq, movq AVGH 24, 32 AVG_FUNC 48, movq, movq AVGH 48, 64 INIT_XMM sse2 AVG_FUNC 64, movdqu, movdqa AVGH 64, 64 AVGH 64, 48 AVGH 64, 32 AVGH 64, 16 AVG_FUNC 32, movdqu, movdqa AVGH 32, 64 AVGH 32, 32 AVGH 32, 24 AVGH 32, 16 AVGH 32, 8 AVG_FUNC 24, movdqu, movdqa AVGH 24, 32 AVG_FUNC 16, movdqu, movdqa AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 AVGH 16, 4 AVG_FUNC 48, movdqu, movdqa AVGH 48, 64 AVG_FUNC 12, movdqu, movdqa, movq AVGH 12, 16 AVGH 8, 32 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 INIT_XMM ssse3 AVGH 24, 32 AVGH 64, 64 AVGH 64, 48 AVGH 64, 32 AVGH 64, 16 AVGH 32, 64 AVGH 32, 32 AVGH 32, 24 AVGH 32, 16 AVGH 32, 8 AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 
AVGH 16, 4 AVGH 48, 64 AVGH 12, 16 AVGH 8, 32 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 INIT_MMX ssse3 AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 INIT_XMM avx2 ; TODO: active AVX2 after debug ;AVG_FUNC 24, movdqu, movdqa ;AVGH 24, 32 AVG_FUNC 16, movdqu, movdqa AVGH 16, 64 AVGH 16, 32 AVGH 16, 16 AVGH 16, 12 AVGH 16, 8 AVGH 16, 4 %endif ;HIGH_BIT_DEPTH ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 && BIT_DEPTH == 8 INIT_YMM avx2 cglobal pixel_avg_8x32 %rep 4 movu m0, [r2] movu m2, [r2 + r3] movu m1, [r4] movu m3, [r4 + r5] pavgb m0, m1 pavgb m2, m3 movu [r0], m0 movu [r0 + r1], m2 lea r2, [r2 + r3 * 2] lea r4, [r4 + r5 * 2] lea r0, [r0 + r1 * 2] %endrep ret cglobal pixel_avg_16x64_8bit %rep 8 movu m0, [r2] movu m2, [r2 + mmsize] movu m1, [r4] movu m3, [r4 + mmsize] pavgb m0, m1 pavgb m2, m3 movu [r0], m0 movu [r0 + mmsize], m2 movu m0, [r2 + r3] movu m2, [r2 + r3 + mmsize] movu m1, [r4 + r5] movu m3, [r4 + r5 + mmsize] pavgb m0, m1 pavgb m2, m3 movu [r0 + r1], m0 movu [r0 + r1 + mmsize], m2 lea r2, [r2 + r3 * 2] lea r4, [r4 + r5 * 2] lea r0, [r0 + r1 * 2] %endrep ret cglobal pixel_avg_32x8, 6,6,4 call pixel_avg_8x32 RET cglobal pixel_avg_32x16, 6,6,4 call pixel_avg_8x32 call pixel_avg_8x32 RET cglobal pixel_avg_32x24, 6,6,4 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 RET cglobal pixel_avg_32x32, 6,6,4 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 RET cglobal pixel_avg_32x64, 6,6,4 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 call pixel_avg_8x32 RET cglobal pixel_avg_64x16, 6,6,4 call pixel_avg_16x64_8bit RET cglobal pixel_avg_64x32, 6,6,4 call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit RET cglobal pixel_avg_64x48, 6,6,4 call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit RET cglobal pixel_avg_64x64, 6,6,4 call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit call pixel_avg_16x64_8bit RET cglobal pixel_avg_48x64, 6,7,4 mov r6d, 4 .loop: %rep 8 movu m0, [r2] movu xm2, [r2 + mmsize] movu m1, [r4] movu xm3, [r4 + mmsize] pavgb m0, m1 pavgb xm2, xm3 movu [r0], m0 movu [r0 + mmsize], xm2 movu m0, [r2 + r3] movu xm2, [r2 + r3 + mmsize] movu m1, [r4 + r5] movu xm3, [r4 + r5 + mmsize] pavgb m0, m1 pavgb xm2, xm3 movu [r0 + r1], m0 movu [r0 + r1 + mmsize], xm2 lea r2, [r2 + r3 * 2] lea r4, [r4 + r5 * 2] lea r0, [r0 + r1 * 2] %endrep dec r6d jnz .loop RET %endif ;============================================================================= ; pixel avg2 ;============================================================================= %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, ; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 cglobal pixel_avg2_w%1, 6,7,4 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2] movu m1, [r2+r3*2] %if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else movu m2, [r2+r4] movu m3, [r2+r6] pavgw 
m0, m2 pavgw m1, m3 %endif mova [r0], m0 mova [r0+r1*2], m1 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r5d, 2 jg .height_loop RET %endmacro %macro AVG2_W_TWO 3 cglobal pixel_avg2_w%1, 6,7,8 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2] %2 m1, [r2+mmsize] movu m2, [r2+r3*2] %2 m3, [r2+r3*2+mmsize] %if mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r4+mmsize] pavgw m2, [r2+r6] pavgw m3, [r2+r6+mmsize] %else movu m4, [r2+r4] %2 m5, [r2+r4+mmsize] movu m6, [r2+r6] %2 m7, [r2+r6+mmsize] pavgw m0, m4 pavgw m1, m5 pavgw m2, m6 pavgw m3, m7 %endif mova [r0], m0 %3 [r0+mmsize], m1 mova [r0+r1*2], m2 %3 [r0+r1*2+mmsize], m3 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r5d, 2 jg .height_loop RET %endmacro INIT_MMX mmx2 AVG2_W_ONE 4 AVG2_W_TWO 8, movu, mova INIT_XMM sse2 AVG2_W_ONE 8 AVG2_W_TWO 10, movd, movd AVG2_W_TWO 16, movu, mova INIT_YMM avx2 AVG2_W_ONE 16 INIT_MMX cglobal pixel_avg2_w10_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movh m2, [r2+16] movu m3, [r2+r3*2+ 0] movu m4, [r2+r3*2+ 8] movh m5, [r2+r3*2+16] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r6+ 0] pavgw m4, [r2+r6+ 8] pavgw m5, [r2+r6+16] mova [r0+ 0], m0 mova [r0+ 8], m1 movh [r0+16], m2 mova [r0+r1*2+ 0], m3 mova [r0+r1*2+ 8], m4 movh [r0+r1*2+16], m5 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] sub r5d, 2 jg .height_loop RET cglobal pixel_avg2_w16_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movu m2, [r2+16] movu m3, [r2+24] movu m4, [r2+r3*2+ 0] movu m5, [r2+r3*2+ 8] movu m6, [r2+r3*2+16] movu m7, [r2+r3*2+24] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r4+24] pavgw m4, [r2+r6+ 0] pavgw m5, [r2+r6+ 8] pavgw m6, [r2+r6+16] pavgw m7, [r2+r6+24] mova [r0+ 0], m0 mova [r0+ 8], m1 mova [r0+16], m2 mova [r0+24], m3 mova [r0+r1*2+ 0], m4 mova [r0+r1*2+ 8], m5 mova [r0+r1*2+16], m6 mova [r0+r1*2+24], m7 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] sub r5d, 2 jg .height_loop RET cglobal pixel_avg2_w18_mmx2, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movu m2, [r2+16] movu m3, [r2+24] movh m4, [r2+32] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r4+24] pavgw m4, [r2+r4+32] mova [r0+ 0], m0 mova [r0+ 8], m1 mova [r0+16], m2 mova [r0+24], m3 movh [r0+32], m4 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET %macro PIXEL_AVG_W18 0 cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] movd xm2, [r2+32] %if mmsize == 32 pavgw m0, [r2+r4+ 0] movd xm1, [r2+r4+32] pavgw xm2, xm1 %else movu m1, [r2+16] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 mova [r0+16], m1 %endif mova [r0+ 0], m0 movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET %endmacro INIT_XMM sse2 PIXEL_AVG_W18 INIT_YMM avx2 PIXEL_AVG_W18 ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_12x16, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], xm0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw 
m2, m3 movu [r0 + r1], xm2 vextracti128 xm0, m0, 1 vextracti128 xm2, m2, 1 movq [r0 + 16], xm0 movq [r0 + r1 + 16], xm2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], xm0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], xm2 vextracti128 xm0, m0, 1 vextracti128 xm2, m2, 1 movq [r0 + r1 * 2 + 16], xm0 movq [r0 + r8 + 16], xm2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] dec r9d jnz .loop RET %endif %macro pixel_avg_H4 0 movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] %endmacro ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_16x4, 6,9,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] pixel_avg_H4 RET cglobal pixel_avg_16x8, 6,9,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] pixel_avg_H4 pixel_avg_H4 RET cglobal pixel_avg_16x12, 6,9,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] pixel_avg_H4 pixel_avg_H4 pixel_avg_H4 RET %endif %macro pixel_avg_H16 0 movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] %endmacro ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_16x16, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_H16 dec r9d jnz .loop RET cglobal pixel_avg_16x32, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_H16 pixel_avg_H16 dec r9d jnz .loop RET cglobal pixel_avg_16x64, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_H16 pixel_avg_H16 pixel_avg_H16 pixel_avg_H16 dec r9d jnz .loop RET %endif ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_24x32, 6,10,4 add 
r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 .loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu xm0, [r2 + 32] movu xm1, [r4 + 32] pavgw xm0, xm1 movu [r0 + 32], xm0 movu xm2, [r2 + r3 + 32] movu xm3, [r4 + r5 + 32] pavgw xm2, xm3 movu [r0 + r1 + 32], xm2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 movu xm0, [r2 + r3 * 2 + 32] movu xm1, [r4 + r5 * 2 + 32] pavgw xm0, xm1 movu [r0 + r1 * 2 + 32], xm0 movu xm2, [r2 + r6 + 32] movu xm3, [r4 + r7 + 32] pavgw xm2, xm3 movu [r0 + r8 + 32], xm2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] dec r9d jnz .loop RET %endif %macro pixel_avg_W32 0 movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + 32] movu m1, [r4 + 32] pavgw m0, m1 movu [r0 + 32], m0 movu m2, [r2 + r3 + 32] movu m3, [r4 + r5 + 32] pavgw m2, m3 movu [r0 + r1 + 32], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 movu m0, [r2 + r3 * 2 + 32] movu m1, [r4 + r5 * 2 + 32] pavgw m0, m1 movu [r0 + r1 * 2 + 32], m0 movu m2, [r2 + r6 + 32] movu m3, [r4 + r7 + 32] pavgw m2, m3 movu [r0 + r8 + 32], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] %endmacro ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_32x8, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 2 .loop: pixel_avg_W32 dec r9d jnz .loop RET cglobal pixel_avg_32x16, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_W32 dec r9d jnz .loop RET cglobal pixel_avg_32x24, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 6 .loop: pixel_avg_W32 dec r9d jnz .loop RET cglobal pixel_avg_32x32, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 .loop: pixel_avg_W32 dec r9d jnz .loop RET cglobal pixel_avg_32x64, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 16 .loop: pixel_avg_W32 dec r9d jnz .loop RET %endif %macro pixel_avg_W64 0 movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + 32] movu m1, [r4 + 32] pavgw m0, m1 movu [r0 + 32], m0 movu m2, [r2 + r3 + 32] movu m3, [r4 + r5 + 32] pavgw m2, m3 movu [r0 + r1 + 32], m2 movu m0, [r2 + 64] movu m1, [r4 + 64] pavgw m0, m1 movu [r0 + 64], m0 movu m2, [r2 + r3 + 64] movu m3, [r4 + r5 + 64] pavgw m2, m3 movu [r0 + r1 + 64], m2 movu m0, [r2 + 96] movu m1, [r4 + 96] pavgw m0, m1 movu [r0 + 96], m0 movu m2, [r2 + r3 + 96] movu m3, [r4 + r5 + 96] pavgw m2, m3 movu [r0 + r1 + 96], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, 
[r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 movu m0, [r2 + r3 * 2 + 32] movu m1, [r4 + r5 * 2 + 32] pavgw m0, m1 movu [r0 + r1 * 2 + 32], m0 movu m2, [r2 + r6 + 32] movu m3, [r4 + r7 + 32] pavgw m2, m3 movu [r0 + r8 + 32], m2 movu m0, [r2 + r3 * 2 + 64] movu m1, [r4 + r5 * 2 + 64] pavgw m0, m1 movu [r0 + r1 * 2 + 64], m0 movu m2, [r2 + r6 + 64] movu m3, [r4 + r7 + 64] pavgw m2, m3 movu [r0 + r8 + 64], m2 movu m0, [r2 + r3 * 2 + 96] movu m1, [r4 + r5 * 2 + 96] pavgw m0, m1 movu [r0 + r1 * 2 + 96], m0 movu m2, [r2 + r6 + 96] movu m3, [r4 + r7 + 96] pavgw m2, m3 movu [r0 + r8 + 96], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] %endmacro ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_64x16, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 .loop: pixel_avg_W64 dec r9d jnz .loop RET cglobal pixel_avg_64x32, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 .loop: pixel_avg_W64 dec r9d jnz .loop RET cglobal pixel_avg_64x48, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 12 .loop: pixel_avg_W64 dec r9d jnz .loop RET cglobal pixel_avg_64x64, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 16 .loop: pixel_avg_W64 dec r9d jnz .loop RET %endif ;------------------------------------------------------------------------------------------------------------------------------- ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) ;------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_avg_48x64, 6,10,4 add r1d, r1d add r3d, r3d add r5d, r5d lea r6, [r3 * 3] lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 16 .loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 movu [r0], m0 movu m2, [r2 + r3] movu m3, [r4 + r5] pavgw m2, m3 movu [r0 + r1], m2 movu m0, [r2 + 32] movu m1, [r4 + 32] pavgw m0, m1 movu [r0 + 32], m0 movu m2, [r2 + r3 + 32] movu m3, [r4 + r5 + 32] pavgw m2, m3 movu [r0 + r1 + 32], m2 movu m0, [r2 + 64] movu m1, [r4 + 64] pavgw m0, m1 movu [r0 + 64], m0 movu m2, [r2 + r3 + 64] movu m3, [r4 + r5 + 64] pavgw m2, m3 movu [r0 + r1 + 64], m2 movu m0, [r2 + r3 * 2] movu m1, [r4 + r5 * 2] pavgw m0, m1 movu [r0 + r1 * 2], m0 movu m2, [r2 + r6] movu m3, [r4 + r7] pavgw m2, m3 movu [r0 + r8], m2 movu m0, [r2 + r3 * 2 + 32] movu m1, [r4 + r5 * 2 + 32] pavgw m0, m1 movu [r0 + r1 * 2 + 32], m0 movu m2, [r2 + r6 + 32] movu m3, [r4 + r7 + 32] pavgw m2, m3 movu [r0 + r8 + 32], m2 movu m0, [r2 + r3 * 2 + 64] movu m1, [r4 + r5 * 2 + 64] pavgw m0, m1 movu [r0 + r1 * 2 + 64], m0 movu m2, [r2 + r6 + 64] movu m3, [r4 + r7 + 64] pavgw m2, m3 movu [r0 + r8 + 64], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] lea r4, [r4 + 4 * r5] dec r9d jnz .loop RET %endif %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void 
pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, ; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 cglobal pixel_avg2_w%1_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: %2 mm0, [r2] %2 mm1, [r2+r3] pavgb mm0, [r2+r4] pavgb mm1, [r2+r6] lea r2, [r2+r3*2] %2 [r0], mm0 %2 [r0+r1], mm1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET %endmacro INIT_MMX AVG2_W8 4, movd AVG2_W8 8, movq %macro AVG2_W16 2 cglobal pixel_avg2_w%1_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movq mm0, [r4] %2 mm1, [r4+8] movq mm2, [r4+r3] %2 mm3, [r4+r3+8] pavgb mm0, [r4+r2] pavgb mm1, [r4+r2+8] pavgb mm2, [r4+r6] pavgb mm3, [r4+r6+8] lea r4, [r4+r3*2] movq [r0], mm0 %2 [r0+8], mm1 movq [r0+r1], mm2 %2 [r0+r1+8], mm3 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET %endmacro AVG2_W16 12, movd AVG2_W16 16, movq cglobal pixel_avg2_w20_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movq mm0, [r4] movq mm1, [r4+8] movd mm2, [r4+16] movq mm3, [r4+r3] movq mm4, [r4+r3+8] movd mm5, [r4+r3+16] pavgb mm0, [r4+r2] pavgb mm1, [r4+r2+8] pavgb mm2, [r4+r2+16] pavgb mm3, [r4+r6] pavgb mm4, [r4+r6+8] pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] movq [r0], mm0 movq [r0+8], mm1 movd [r0+16], mm2 movq [r0+r1], mm3 movq [r0+r1+8], mm4 movd [r0+r1+16], mm5 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET INIT_XMM cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: movu m0, [r2] movu m2, [r2+r3] movu m1, [r2+r4] movu m3, [r2+r6] lea r2, [r2+r3*2] pavgb m0, m1 pavgb m2, m3 mova [r0], m0 mova [r0+r1], m2 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET cglobal pixel_avg2_w20_sse2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movu m0, [r4] movu m2, [r4+r3] movu m1, [r4+r2] movu m3, [r4+r6] movd mm4, [r4+16] movd mm5, [r4+r3+16] pavgb m0, m1 pavgb m2, m3 pavgb mm4, [r4+r2+16] pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] mova [r0], m0 mova [r0+r1], m2 movd [r0+16], mm4 movd [r0+r1+16], mm5 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET INIT_YMM avx2 cglobal pixel_avg2_w20, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movu m0, [r4] movu m1, [r4+r3] pavgb m0, [r4+r2] pavgb m1, [r4+r6] lea r4, [r4+r3*2] mova [r0], m0 mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. ; This particular instance is complicated by the fact that src1 and src2 ; can have different alignments. For simplicity and code size, only the ; MMX cacheline workaround is used. As a result, in the case of SSE2 ; pixel_avg, the cacheline check functions calls the SSE2 version if there ; is no cacheline split, and the MMX workaround if there is. 
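;-----------------------------------------------------------------------------
; Editor's note (illustrative sketch, not original source): INIT_SHIFT and
; AVG_CACHELINE_LOOP below rebuild an unaligned 8-byte load from two aligned
; loads so that no single load straddles a cache line: the source pointer is
; rounded down to a multiple of 8 and the two halves are merged with
; psrlq/psllq/por using shift counts derived from the misalignment.  On a
; little-endian machine the same reconstruction (assumed helper name) is:
;
;   #include <stdint.h>
;
;   static uint64_t load64_cacheline_safe(const uint8_t *p)
;   {
;       uintptr_t mis = (uintptr_t)p & 7;                 /* bytes past the aligned base */
;       const uint64_t *a = (const uint64_t *)((uintptr_t)p - mis);
;       if (!mis)
;           return a[0];                                  /* already aligned */
;       /* low part shifted down, high part shifted up, then merged */
;       return (a[0] >> (8 * mis)) | (a[1] << (64 - 8 * mis));
;   }
;-----------------------------------------------------------------------------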
%macro INIT_SHIFT 2 and eax, 7 shl eax, 3 movd %1, [pd_64] movd %2, eax psubw %1, %2 %endmacro %macro AVG_CACHELINE_START 0 %assign stack_offset 0 INIT_SHIFT mm6, mm7 mov eax, r4m INIT_SHIFT mm4, mm5 PROLOGUE 6,6 and r2, ~7 and r4, ~7 sub r4, r2 .height_loop: %endmacro %macro AVG_CACHELINE_LOOP 2 movq mm1, [r2+%1] movq mm0, [r2+8+%1] movq mm3, [r2+r4+%1] movq mm2, [r2+r4+8+%1] psrlq mm1, mm7 psllq mm0, mm6 psrlq mm3, mm5 psllq mm2, mm4 por mm0, mm1 por mm2, mm3 pavgb mm2, mm0 %2 [r0+%1], mm2 %endmacro %macro AVG_CACHELINE_FUNC 2 pixel_avg2_w%1_cache_mmx2: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq %if %1>8 AVG_CACHELINE_LOOP 8, movq %if %1>16 AVG_CACHELINE_LOOP 16, movd %endif %endif add r2, r3 add r0, r1 dec r5d jg .height_loop RET %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set %if %1 == 12 ;w12 isn't needed because w16 is just as fast if there's no cacheline split %define cachesplit pixel_avg2_w16_cache_mmx2 %else %define cachesplit pixel_avg2_w%1_cache_mmx2 %endif cglobal pixel_avg2_w%1_cache%2_%3 mov eax, r2m and eax, %2-1 cmp eax, (%2-%1-(%1 % 8)) %if %1==12||%1==20 jbe pixel_avg2_w%1_%3 %else jb pixel_avg2_w%1_%3 %endif %if 0 ; or %1==8 - but the extra branch seems too expensive ja cachesplit %if ARCH_X86_64 test r4b, 1 %else test byte r4m, 1 %endif jz pixel_avg2_w%1_%3 %else or eax, r4m and eax, 7 jz pixel_avg2_w%1_%3 mov eax, r2m %endif %if mmsize==16 || (%1==8 && %2==64) AVG_CACHELINE_FUNC %1, %2 %else jmp cachesplit %endif %endmacro INIT_MMX AVG_CACHELINE_CHECK 8, 64, mmx2 AVG_CACHELINE_CHECK 12, 64, mmx2 %if ARCH_X86_64 == 0 AVG_CACHELINE_CHECK 16, 64, mmx2 AVG_CACHELINE_CHECK 20, 64, mmx2 AVG_CACHELINE_CHECK 8, 32, mmx2 AVG_CACHELINE_CHECK 12, 32, mmx2 AVG_CACHELINE_CHECK 16, 32, mmx2 AVG_CACHELINE_CHECK 20, 32, mmx2 %endif INIT_XMM AVG_CACHELINE_CHECK 16, 64, sse2 AVG_CACHELINE_CHECK 20, 64, sse2 ; computed jump assumes this loop is exactly 48 bytes %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment ALIGN 16 avg_w16_align%1_%2_ssse3: %if %1==0 && %2==0 movdqa xmm1, [r2] pavgb xmm1, [r2+r4] add r2, r3 %elif %1==0 movdqa xmm1, [r2+r4+16] palignr xmm1, [r2+r4], %2 pavgb xmm1, [r2] add r2, r3 %elif %2&15==0 movdqa xmm1, [r2+16] palignr xmm1, [r2], %1 pavgb xmm1, [r2+r4] add r2, r3 %else movdqa xmm1, [r2+16] movdqa xmm2, [r2+r4+16] palignr xmm1, [r2], %1 palignr xmm2, [r2+r4], %2&15 add r2, r3 pavgb xmm1, xmm2 %endif movdqa [r0], xmm1 add r0, r1 dec r5d jg avg_w16_align%1_%2_ssse3 ret %if %1==0 ; make sure the first ones don't end up short ALIGN 16 times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop %endif %endmacro cglobal pixel_avg2_w16_cache64_ssse3 %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized mov eax, r2m and eax, 0x3f cmp eax, 0x30 jb xavs2_pixel_avg2_w16_sse2 or eax, r4m and eax, 7 jz xavs2_pixel_avg2_w16_sse2 %endif PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f and r2, ~0xf lea r6, [r6*3] ;(offset + align*2)*3 sub r4, r2 shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %ifdef PIC lea r7, [avg_w16_addr] add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif TAIL_CALL r6, 1 %assign j 0 %assign k 1 %rep 16 AVG16_CACHELINE_LOOP_SSSE3 j, j AVG16_CACHELINE_LOOP_SSSE3 j, k %assign j j+1 %assign k k+1 %endrep %endif ; !HIGH_BIT_DEPTH ;============================================================================= ; pixel copy ;============================================================================= %macro COPY1 2 movu m0, [r2] movu m1, [r2+r3] 
movu m2, [r2+r3*2] movu m3, [r2+%2] mova [r0], m0 mova [r0+r1], m1 mova [r0+r1*2], m2 mova [r0+%1], m3 %endmacro %macro COPY2 2-4 0, 1 movu m0, [r2+%3*mmsize] movu m1, [r2+%4*mmsize] movu m2, [r2+r3+%3*mmsize] movu m3, [r2+r3+%4*mmsize] mova [r0+%3*mmsize], m0 mova [r0+%4*mmsize], m1 mova [r0+r1+%3*mmsize], m2 mova [r0+r1+%4*mmsize], m3 movu m0, [r2+r3*2+%3*mmsize] movu m1, [r2+r3*2+%4*mmsize] movu m2, [r2+%2+%3*mmsize] movu m3, [r2+%2+%4*mmsize] mova [r0+r1*2+%3*mmsize], m0 mova [r0+r1*2+%4*mmsize], m1 mova [r0+%1+%3*mmsize], m2 mova [r0+%1+%4*mmsize], m3 %endmacro %macro COPY4 2 COPY2 %1, %2, 0, 1 COPY2 %1, %2, 2, 3 %endmacro ;----------------------------------------------------------------------------- ; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, ; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 FIX_STRIDES r1, r3 cmp dword r4m, 4 lea r5, [r3*3] lea r4, [r1*3] je .end %if HIGH_BIT_DEPTH == 0 %define mova movd %define movu movd %endif COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] .end: COPY1 r4, r5 RET %macro MC_COPY 1 %assign %%w %1*SIZEOF_PIXEL/mmsize %if %%w > 0 cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r4d, 4 jg .height_loop RET %endif %endmacro INIT_MMX mmx MC_COPY 8 MC_COPY 16 INIT_XMM sse MC_COPY 8 MC_COPY 16 INIT_XMM aligned, sse MC_COPY 16 %if HIGH_BIT_DEPTH INIT_YMM avx MC_COPY 16 INIT_YMM aligned, avx MC_COPY 16 %endif ;============================================================================= ; prefetch ;============================================================================= ; assumes 64 byte cachelines ; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- ; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, ; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- %macro PREFETCH_FENC 1 %if ARCH_X86_64 cglobal prefetch_fenc_%1, 5,5 FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] imul eax, r3d lea r2, [r2+rax*2+64*SIZEOF_PIXEL] prefetcht0 [r2] prefetcht0 [r2+r3] %ifidn %1, 422 lea r2, [r2+r3*2] prefetcht0 [r2] prefetcht0 [r2+r3] %endif RET %else cglobal prefetch_fenc_%1, 0,3 mov r2, r4m mov r1, r1m mov r0, r0m FIX_STRIDES r1 and r2, 3 imul r2, r1 lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] mov r2, r4m mov r1, r3m mov r0, r2m FIX_STRIDES r1 and r2, 3 imul r2, r1 lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] %ifidn %1, 422 lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] %endif ret %endif ; ARCH_X86_64 %endmacro INIT_MMX mmx2 PREFETCH_FENC 420 PREFETCH_FENC 422 ;----------------------------------------------------------------------------- ; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal prefetch_ref, 3,3 FIX_STRIDES r1 dec r2d and r2d, r1d lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] prefetcht0 [r0+r2] lea r0, [r0+r1*4] prefetcht0 [r0] prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] 
prefetcht0 [r0+r2] RET xavs2-1.3/source/common/x86/mc-a2.asm000066400000000000000000001443751340660520300172530ustar00rootroot00000000000000;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Holger Lubitz ;* Mathieu Monnier ;* Oskar Arvidsson ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7 db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15 const pq_256, times 4 dq 256.0 const pd_inv256, times 4 dq 0.00390625 const pd_0_5, times 4 dq 0.5 SECTION .text cextern pb_0 cextern pw_1 cextern pw_16 cextern pw_32 cextern pw_512 cextern pw_00ff cextern pw_1024 cextern pw_3fff cextern pw_pixel_max cextern pd_ffff cextern pd_16 ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. ;Doing the hpel_filter temporal may be a win if the last level cache ;is big enough (preliminary benching suggests on the order of 4* framesize). 
;%define movntq movq ;%define movntps movaps ;%define sfence %if HIGH_BIT_DEPTH == 0 %undef movntq %undef movntps %undef sfence %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void plane_copy_core( pixel *dst, intptr_t i_dst, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>w INIT_MMX cglobal plane_copy_core_mmx2, 6,7 FIX_STRIDES r1, r3, r4d %if HIGH_BIT_DEPTH == 0 movsxdifnidn r4, r4d %endif sub r1, r4 sub r3, r4 .loopy: lea r6d, [r4-63] .loopx: prefetchnta [r2+256] movq m0, [r2 ] movq m1, [r2+ 8] movntq [r0 ], m0 movntq [r0+ 8], m1 movq m2, [r2+16] movq m3, [r2+24] movntq [r0+16], m2 movntq [r0+24], m3 movq m4, [r2+32] movq m5, [r2+40] movntq [r0+32], m4 movntq [r0+40], m5 movq m6, [r2+48] movq m7, [r2+56] movntq [r0+48], m6 movntq [r0+56], m7 add r2, 64 add r0, 64 sub r6d, 64 jg .loopx prefetchnta [r2+256] add r6d, 63 jle .end16 .loop16: movq m0, [r2 ] movq m1, [r2+8] movntq [r0 ], m0 movntq [r0+8], m1 add r2, 16 add r0, 16 sub r6d, 16 jg .loop16 .end16: add r0, r1 add r2, r3 dec r5d jg .loopy sfence emms RET %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH %assign x 0 %rep 16/mmsize mov%4 m0, [%2+(x/2)*mmsize] mov%4 m1, [%3+(x/2)*mmsize] punpckhwd m2, m0, m1 punpcklwd m0, m1 mov%5a [%1+(x+0)*mmsize], m0 mov%5a [%1+(x+1)*mmsize], m2 %assign x (x+2) %endrep %else movq m0, [%2] %if mmsize==16 %ifidn %4, a punpcklbw m0, [%3] %else movq m1, [%3] punpcklbw m0, m1 %endif mov%5a [%1], m0 %else movq m1, [%3] punpckhbw m2, m0, m1 punpcklbw m0, m1 mov%5a [%1+0], m0 mov%5a [%1+8], m2 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned %if HIGH_BIT_DEPTH %assign n 0 %rep 16/mmsize mova m0, [%3+(n+0)*mmsize] mova m1, [%3+(n+1)*mmsize] psrld m2, m0, 16 psrld m3, m1, 16 pand m0, %5 pand m1, %5 packssdw m0, m1 packssdw m2, m3 mov%6 [%1+(n/2)*mmsize], m0 mov%6 [%2+(n/2)*mmsize], m2 %assign n (n+2) %endrep %else ; !HIGH_BIT_DEPTH %if mmsize==16 mova m0, [%3] %if cpuflag(ssse3) pshufb m0, %5 %else mova m1, m0 pand m0, %5 psrlw m1, 8 packuswb m0, m1 %endif %if %4 mova [%1], m0 %else movq [%1], m0 movhps [%2], m0 %endif %else mova m0, [%3] mova m1, [%3+8] mova m2, m0 mova m3, m1 pand m0, %5 pand m1, %5 psrlw m2, 8 psrlw m3, 8 packuswb m0, m1 packuswb m2, m3 mova [%1], m0 mova [%2], m2 %endif ; mmsize == 16 %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, ; uint8_t *srcu, intptr_t i_srcu, ; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w cglobal plane_copy_interleave_core, 6,9 mov r6d, r6m %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3, r5, r6d movifnidn r1mp, r1 movifnidn r3mp, r3 mov r6m, r6d %endif lea r0, [r0+r6*2] add r2, r6 add r4, r6 %if ARCH_X86_64 DECLARE_REG_TMP 7,8 %else DECLARE_REG_TMP 1,3 %endif mov t1, r1 shr t1, SIZEOF_PIXEL sub t1, r6 mov t0d, r7m .loopy: mov r6d, r6m neg r6 .prefetch: prefetchnta [r2+r6] prefetchnta [r4+r6] add r6, 64 jl .prefetch mov r6d, r6m neg r6 .loopx: INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, 
r4+r6+8*SIZEOF_PIXEL, u, nt add r6, 16*SIZEOF_PIXEL jl .loopx .pad: %assign n 0 %rep SIZEOF_PIXEL %if mmsize==8 movntq [r0+r6*2+(n+ 0)], m0 movntq [r0+r6*2+(n+ 8)], m0 movntq [r0+r6*2+(n+16)], m0 movntq [r0+r6*2+(n+24)], m0 %else movntdq [r0+r6*2+(n+ 0)], m0 movntdq [r0+r6*2+(n+16)], m0 %endif %assign n n+32 %endrep add r6, 16*SIZEOF_PIXEL cmp r6, t1 jl .pad add r0, r1mp add r2, r3mp add r4, r5 dec t0d jg .loopy sfence emms RET ;----------------------------------------------------------------------------- ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a add r2, FDEC_STRIDEB*2 add r3, FDEC_STRIDEB*2 lea r0, [r0+r1*2] sub r4d, 2 jg .loop RET %endmacro ; PLANE_INTERLEAVE %macro DEINTERLEAVE_START 0 %if HIGH_BIT_DEPTH mova m4, [pd_ffff] %elif cpuflag(ssse3) mova m4, [deinterleave_shuf] %else mova m4, [pw_00ff] %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, ; pixel *dstv, intptr_t i_dstv, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- cglobal plane_copy_deinterleave, 6,7 DEINTERLEAVE_START mov r6d, r6m FIX_STRIDES r1, r3, r5, r6d %if HIGH_BIT_DEPTH mov r6m, r6d %endif add r0, r6 add r2, r6 lea r4, [r4+r6*2] .loopy: mov r6d, r6m neg r6 .loopx: DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u add r6, 16*SIZEOF_PIXEL jl .loopx add r0, r1 add r2, r3 add r4, r5 dec dword r7m jg .loopy RET ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a add r0, FENC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg .loop RET ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a add r0, FDEC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg .loop RET %endmacro ; PLANE_DEINTERLEAVE %if HIGH_BIT_DEPTH INIT_MMX mmx2 PLANE_INTERLEAVE INIT_MMX mmx PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE INIT_XMM avx PLANE_INTERLEAVE PLANE_DEINTERLEAVE %else INIT_MMX mmx2 PLANE_INTERLEAVE INIT_MMX mmx PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE INIT_XMM ssse3 PLANE_DEINTERLEAVE %endif ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size. ; memzero SSE will fail for non-mod128. 
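; A minimal caller sketch (illustrative C, not part of this file) of the
; alignment/size contract described above, using the prototypes from
; source/common/x86/mc.h and C11 aligned_alloc; the encoder's own allocation
; path may differ, this only shows sizes/alignment the SSE routines expect:
;
;   size_t n16  = (len + 15)  & ~(size_t)15;    /* mod-16 size for memcpy   */
;   size_t n128 = (len + 127) & ~(size_t)127;   /* mod-128 size for memzero */
;   uint8_t *dst = aligned_alloc(16, n128);     /* 16-byte aligned buffers  */
;   uint8_t *src = aligned_alloc(16, n128);
;   xavs2_memzero_aligned_sse(dst, n128);       /* declared in mc.h */
;   xavs2_memcpy_aligned_sse(dst, src, n16);    /* declared in mc.h */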
;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- %macro MEMCPY 0 cglobal memcpy_aligned, 3,3 %if mmsize == 16 test r2d, 16 jz .copy2 mova m0, [r1+r2-16] mova [r0+r2-16], m0 sub r2d, 16 .copy2: %endif test r2d, 2*mmsize jz .copy4start mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 sub r2d, 2*mmsize .copy4start: test r2d, r2d jz .ret .copy4: mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova m2, [r1+r2-3*mmsize] mova m3, [r1+r2-4*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 mova [r0+r2-3*mmsize], m2 mova [r0+r2-4*mmsize], m3 sub r2d, 4*mmsize jg .copy4 .ret: REP_RET %endmacro INIT_MMX mmx MEMCPY INIT_XMM sse MEMCPY ; ---------------------------------------------------------------------------- ; void *fast_memcpy( void *dst, const void *src, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memcpy, 3,5,8 ;{ test r2, r2 ; if n = 0, quit jz .L_QUIT ; ; mov r3, r2 ; r3 <-- r2, copy sar r2, 3 ; r2 <-- n/8 and r3, 0x07 ; r3 <-- n%8 prefetchnta [r1] ; prefetch ahead, non-temporal ; ; cal hexnum/8 and remainder/8 and store ; mov r4, r2 ; r4 <-- r2, copy sar r2, 3 ; r2 <-- (n/8)/8 and r4, 0x07 ; r4 <-- (n/8)%8 cmp r2, 0 ; je .HEX_ZERO ; ; align 4 ; .L_COPY_64X: ; prefetchnta [r1 + 128] ; prefetch ahead, non-temporal prefetchnta [r1 + 256] ; prefetch ahead, non-temporal ; ; load 64 bytes data form src ; movq m0, [r1 + 0*8] ; load 8 bytes movq m1, [r1 + 1*8] ; load 8 bytes movq m2, [r1 + 2*8] ; load 8 bytes movq m3, [r1 + 3*8] ; load 8 bytes movq m4, [r1 + 4*8] ; load 8 bytes movq m5, [r1 + 5*8] ; load 8 bytes movq m6, [r1 + 6*8] ; load 8 bytes movq m7, [r1 + 7*8] ; load 8 bytes ; ; store the 64 bytes to dst ; movntq [r0 + 0*8], m0 ; store 8 bytes movntq [r0 + 1*8], m1 ; store 8 bytes movntq [r0 + 2*8], m2 ; store 8 bytes movntq [r0 + 3*8], m3 ; store 8 bytes movntq [r0 + 4*8], m4 ; store 8 bytes movntq [r0 + 5*8], m5 ; store 8 bytes movntq [r0 + 6*8], m6 ; store 8 bytes movntq [r0 + 7*8], m7 ; store 8 bytes ; add r1, 64 ; add r0, 64 ; dec r2 ; jnz .L_COPY_64X ; ; .HEX_ZERO: ; cmp r4, 0 ; je .L_RESIDUAL ; ; .L_COPY_8X: ; movq m3, [r1] ; load 8 bytes movntq [r0], m3 ; store 8 bytes add r1, 8 ; add r0, 8 ; dec r4 ; jnz .L_COPY_8X ; ; .L_RESIDUAL: ; ; quit ; cmp r3, 0 ; je .L_QUIT ; ; .L_COPY_1X: ; mov r2b, [r1] ; mov [r0], r2b ; add r1, 1 ; add r0, 1 ; dec r3 ; jnz .L_COPY_1X ; ; .L_QUIT: ; sfence ; emms ; RET ; ;} ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- %macro MEMZERO 1 cglobal memzero_aligned, 2,2 add r0, r1 neg r1 %if mmsize == 8 pxor m0, m0 %else xorps m0, m0 %endif .loop: %assign i 0 %rep %1 mova [r0 + r1 + i], m0 %assign i i+mmsize %endrep add r1, mmsize*%1 jl .loop RET %endmacro INIT_MMX mmx MEMZERO 8 INIT_XMM sse MEMZERO 8 INIT_YMM avx MEMZERO 4 ; ---------------------------------------------------------------------------- ; void *fast_memzero( void *dst, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memzero, 2,3,1 ;{ test r1, r1 ; if n = 0, quit jz .L_QUIT ; mov r2, r1 ; r2 <-- r1 = n, copy sar r1, 3 ; r1 = n/8 and r2, 7 ; r2 = n%8 cmp r1, 0 ; n/8 = 
0? je .HEX_ZERO ; jump if n < 8 pxor m0, m0 ; clear m0 ; .L_SET_8X: ; movntq [r0], m0 ; clear 8 bytes add r0, 8 ; r0 = r0 + 8 dec r1 ; r1 = r1 - 1 jnz .L_SET_8X ; loop until r1 = 0 ; .HEX_ZERO: ; xor r1, r1 ; clear r1 cmp r2, 0 ; n%8 = 0? je .L_QUIT ; ; .L_RESIDUAL: ; mov [r0], r1b ; mov 1 byte add r0, 1 ; dec r2 ; jnz .L_RESIDUAL ; ; .L_QUIT: ; emms ; RET ; ;} ; ---------------------------------------------------------------------------- ; void *fast_memset( void *dst, int val, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memset, 3,4,1 ;{ test r2, r2 ; if n = 0, quit jz .L_QUIT ; mov r3, r2 ; r3 <-- r2 = n, copy sar r2, 3 ; r2 = n/8 and r3, 7 ; r3 = n%8 cmp r2, 0 ; n/8 = 0? je .HEX_ZERO ; jump if n < 8 movd m0, r1d ; m0[ 0] = val (DWORD) pshufw m0, m0, 0 ; m0[ 3 2 1 0] = val (WORD) packsswb m0, m0 ; m0[76543210] = val (BYTE) ; .L_SET_8X: ; movntq [r0], m0 ; clear 8 bytes add r0, 8 ; r0 = r0 + 8 dec r2 ; r2 = r2 - 1 jnz .L_SET_8X ; loop until r2 = 0 ; .HEX_ZERO: ; cmp r3, 0 ; n%8 = 0? je .L_QUIT ; ; .L_RESIDUAL: ; mov [r0], r1b ; mov 1 byte add r0, 1 ; dec r3 ; jnz .L_RESIDUAL ; ; .L_QUIT: ; emms ; RET ; ;} ; ------------------------------------------------------------------ ; param 1: dst, param 2: src stride ; r0 -- src %macro FILT_8x2 2 mova m3, [r0 + 8] mova m2, [r0 ] pavgb m3, [r0 + %2 + 8] pavgb m2, [r0 + %2 ] mova m1, [r0 + 9] mova m0, [r0 + 1] pavgb m1, [r0 + %2 + 9] pavgb m0, [r0 + %2 + 1] pavgb m1, m3 pavgb m0, m2 pand m1, m7 pand m0, m7 packuswb m0, m1 movu [%1], m0 %endmacro ; ------------------------------------------------------------------ ; param 1: dst, param 2: src stride ; r0 -- src %macro FILT_16x2 2 mova m3, [r0 + mmsize] mova m2, [r0 ] pavgb m3, [r0 + %2 + mmsize] pavgb m2, [r0 + %2 ] PALIGNR m0, m3, 1, m6 pavgb m0, m3 PALIGNR m3, m2, 1, m6 pavgb m3, m2 pand m0, m7 pand m3, m7 packuswb m3, m0 movu [%1], m3 %endmacro ; ---------------------------------------------------------------------------- ; void lowres_filter_core_c( pel_t *src, int i_src, pel_t *dst, int i_dst, ; int width, int height ) ; ---------------------------------------------------------------------------- %macro LOWRES_FILTER_CORE 0 cglobal lowres_filter_core, 6,7,8 %if mmsize >= 16 ; add r4, mmsize-1 ; and r4, ~(mmsize-1) ; %endif ; ; src += 2*[(height-1)*i_src + width] ; mov r6d, r5d ; r6 <-- height dec r6d ; r6 <-- (height - 1) imul r6d, r1d ; r6 <-- (height - 1) * i_src add r6d, r4d ; r6 <-- (height - 1) * i_src + width lea r0, [r0+r6*2] ; r0 <== src + 2*((height - 1) * i_src + width) ; dst += (height-1)*stride + width ; mov r6d, r5d ; r6 <-- height dec r6d ; r6 <-- (height - 1) imul r6d, r3d ; r6 <-- (height - 1) * i_dst add r6d, r4d ; r6 <-- (height - 1) * i_dst + width add r2, r6 ; r2 <== dst + (height - 1) * i_dst + width ; gap of src and dst in each line ; sub r3d, r4d ; r3 <== i_dst - width // dst gap mov r6d, r1d ; r6 <-- i_src sub r6d, r4d ; r6 <-- i_src - width shl r6d, 1 ; r6 <-- 2 * (i_src - width) PUSH r6 ; src gap %define src_gap [rsp] ; ; pcmpeqb m7, m7 ; m7 <-- [FFFF...FFFF] psrlw m7, 8 ; m7 <-- [00FF...00FF] ; .vloop: ; ==== for (; height>0; height--) { mov r6d, r4d ; r6 <-- width %ifnidn cpuname, mmx2 ; %if mmsize <= 16 ; mova m0, [r0 ] ; load from src mova m1, [r0 + r1] ; load from down line pavgb m0, m1 ; m0 <-- average of 2 lines %endif ; %endif ; .hloop: ; -------- for (; width>0; width-=mmsize) { sub r0, mmsize*2 ; src -= mmsize * 2 sub r2, mmsize ; dst -= mmsize %ifidn cpuname, mmx2 ; FILT_8x2 r2, r1 ; 
%else ; FILT_16x2 r2, r1 ; %endif ; sub r6d, mmsize ; r6 -= mmsize jg .hloop ; -------- } // end for (width...) ; .skip: ; sub r0, src_gap ; sub r2, r3 ; dec r5d ; jg .vloop ; ==== } // end for (height...) ADD rsp, gprsize ; emms ; RET ; %endmacro ; LOWRES_FILTER_CORE INIT_MMX mmx2 LOWRES_FILTER_CORE ; lowres_filter_core_mmx2 INIT_XMM sse2 LOWRES_FILTER_CORE ; lowres_filter_core_sse2 INIT_XMM ssse3 LOWRES_FILTER_CORE ; lowres_filter_core_ssse3 INIT_XMM avx LOWRES_FILTER_CORE ; lowres_filter_core_avx ; %if HIGH_BIT_DEPTH == 0 ; ;----------------------------------------------------------------------------- ; ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; %macro INTEGRAL_INIT4H 0 ; cglobal integral_init4h, 3,4 ; lea r3, [r0+r2*2] ; add r1, r2 ; neg r2 ; pxor m4, m4 ; .loop: ; mova m0, [r1+r2] ; %if mmsize==32 ; movu m1, [r1+r2+8] ; %else ; mova m1, [r1+r2+16] ; palignr m1, m0, 8 ; %endif ; mpsadbw m0, m4, 0 ; mpsadbw m1, m4, 0 ; paddw m0, [r0+r2*2] ; paddw m1, [r0+r2*2+mmsize] ; mova [r3+r2*2 ], m0 ; mova [r3+r2*2+mmsize], m1 ; add r2, mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_XMM sse4 ; INTEGRAL_INIT4H ; INIT_YMM avx2 ; INTEGRAL_INIT4H ; ; %macro INTEGRAL_INIT8H 0 ; cglobal integral_init8h, 3,4 ; lea r3, [r0+r2*2] ; add r1, r2 ; neg r2 ; pxor m4, m4 ; .loop: ; mova m0, [r1+r2] ; %if mmsize==32 ; movu m1, [r1+r2+8] ; mpsadbw m2, m0, m4, 100100b ; mpsadbw m3, m1, m4, 100100b ; %else ; mova m1, [r1+r2+16] ; palignr m1, m0, 8 ; mpsadbw m2, m0, m4, 100b ; mpsadbw m3, m1, m4, 100b ; %endif ; mpsadbw m0, m4, 0 ; mpsadbw m1, m4, 0 ; paddw m0, [r0+r2*2] ; paddw m1, [r0+r2*2+mmsize] ; paddw m0, m2 ; paddw m1, m3 ; mova [r3+r2*2 ], m0 ; mova [r3+r2*2+mmsize], m1 ; add r2, mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_XMM sse4 ; INTEGRAL_INIT8H ; INIT_XMM avx ; INTEGRAL_INIT8H ; INIT_YMM avx2 ; INTEGRAL_INIT8H ; %endif ; !HIGH_BIT_DEPTH ; ; %macro INTEGRAL_INIT_8V 0 ; ;----------------------------------------------------------------------------- ; ; void integral_init8v( uint16_t *sum8, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; cglobal integral_init8v, 3,3 ; add r1, r1 ; add r0, r1 ; lea r2, [r0+r1*8] ; neg r1 ; .loop: ; mova m0, [r2+r1] ; mova m1, [r2+r1+mmsize] ; psubw m0, [r0+r1] ; psubw m1, [r0+r1+mmsize] ; mova [r0+r1], m0 ; mova [r0+r1+mmsize], m1 ; add r1, 2*mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_MMX mmx ; INTEGRAL_INIT_8V ; INIT_XMM sse2 ; INTEGRAL_INIT_8V ; INIT_YMM avx2 ; INTEGRAL_INIT_8V ; ; ;----------------------------------------------------------------------------- ; ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; INIT_MMX mmx ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; mova m0, [r0+r2] ; mova m4, [r4+r2] ; .loop: ; mova m1, m4 ; psubw m1, m0 ; mova m4, [r4+r2-8] ; mova m0, [r0+r2-8] ; paddw m1, m4 ; mova m3, [r3+r2-8] ; psubw m1, m0 ; psubw m3, m0 ; mova [r0+r2-8], m1 ; mova [r1+r2-8], m3 ; sub r2, 8 ; jge .loop ; RET ; ; INIT_XMM sse2 ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova m0, [r0+r2] ; mova m1, [r4+r2] ; mova m2, m0 ; mova m4, m1 ; shufpd m0, [r0+r2+16], 1 ; shufpd m1, [r4+r2+16], 1 ; paddw m0, m2 ; paddw m1, m4 ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova 
[r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 16 ; jl .loop ; RET ; ; INIT_XMM ssse3 ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova m2, [r0+r2] ; mova m0, [r0+r2+16] ; mova m4, [r4+r2] ; mova m1, [r4+r2+16] ; palignr m0, m2, 8 ; palignr m1, m4, 8 ; paddw m0, m2 ; paddw m1, m4 ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova [r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 16 ; jl .loop ; RET ; ; INIT_YMM avx2 ; cglobal integral_init4v, 3,5 ; add r2, r2 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova m2, [r0+r2] ; movu m1, [r4+r2+8] ; paddw m0, m2, [r0+r2+8] ; paddw m1, [r4+r2] ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova [r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 32 ; jl .loop ; RET ; ; %macro FILT8x4 7 ; mova %3, [r0+%7] ; mova %4, [r0+r5+%7] ; pavgb %3, %4 ; pavgb %4, [r0+r5*2+%7] ; PALIGNR %1, %3, 1, m6 ; PALIGNR %2, %4, 1, m6 ; %if cpuflag(xop) ; pavgb %1, %3 ; pavgb %2, %4 ; %else ; pavgb %1, %3 ; pavgb %2, %4 ; psrlw %5, %1, 8 ; psrlw %6, %2, 8 ; pand %1, m7 ; pand %2, m7 ; %endif ; %endmacro ; ; %macro FILT32x4U 4 ; movu m1, [r0+r5] ; pavgb m0, m1, [r0] ; movu m3, [r0+r5+1] ; pavgb m2, m3, [r0+1] ; pavgb m1, [r0+r5*2] ; pavgb m3, [r0+r5*2+1] ; pavgb m0, m2 ; pavgb m1, m3 ; ; movu m3, [r0+r5+mmsize] ; pavgb m2, m3, [r0+mmsize] ; movu m5, [r0+r5+1+mmsize] ; pavgb m4, m5, [r0+1+mmsize] ; pavgb m3, [r0+r5*2+mmsize] ; pavgb m5, [r0+r5*2+1+mmsize] ; pavgb m2, m4 ; pavgb m3, m5 ; ; pshufb m0, m7 ; pshufb m1, m7 ; pshufb m2, m7 ; pshufb m3, m7 ; punpckhqdq m4, m0, m2 ; punpcklqdq m0, m0, m2 ; punpckhqdq m5, m1, m3 ; punpcklqdq m2, m1, m3 ; vpermq m0, m0, q3120 ; vpermq m1, m4, q3120 ; vpermq m2, m2, q3120 ; vpermq m3, m5, q3120 ; movu [%1], m0 ; movu [%2], m1 ; movu [%3], m2 ; movu [%4], m3 ; %endmacro ; ; %macro FILT16x2 4 ; mova m3, [r0+%4+mmsize] ; mova m2, [r0+%4] ; pavgb m3, [r0+%4+r5+mmsize] ; pavgb m2, [r0+%4+r5] ; PALIGNR %1, m3, 1, m6 ; pavgb %1, m3 ; PALIGNR m3, m2, 1, m6 ; pavgb m3, m2 ; %if cpuflag(xop) ; vpperm m5, m3, %1, m7 ; vpperm m3, m3, %1, m6 ; %else ; psrlw m5, m3, 8 ; psrlw m4, %1, 8 ; pand m3, m7 ; pand %1, m7 ; packuswb m3, %1 ; packuswb m5, m4 ; %endif ; mova [%2], m3 ; mova [%3], m5 ; mova %1, m2 ; %endmacro ; ; %macro FILT8x2U 3 ; mova m3, [r0+%3+8] ; mova m2, [r0+%3] ; pavgb m3, [r0+%3+r5+8] ; pavgb m2, [r0+%3+r5] ; mova m1, [r0+%3+9] ; mova m0, [r0+%3+1] ; pavgb m1, [r0+%3+r5+9] ; pavgb m0, [r0+%3+r5+1] ; pavgb m1, m3 ; pavgb m0, m2 ; psrlw m3, m1, 8 ; psrlw m2, m0, 8 ; pand m1, m7 ; pand m0, m7 ; packuswb m0, m1 ; packuswb m2, m3 ; mova [%1], m0 ; mova [%2], m2 ; %endmacro ; ; %macro FILT8xU 3 ; mova m3, [r0+%3+8] ; mova m2, [r0+%3] ; pavgw m3, [r0+%3+r5+8] ; pavgw m2, [r0+%3+r5] ; movu m1, [r0+%3+10] ; movu m0, [r0+%3+2] ; pavgw m1, [r0+%3+r5+10] ; pavgw m0, [r0+%3+r5+2] ; pavgw m1, m3 ; pavgw m0, m2 ; psrld m3, m1, 16 ; psrld m2, m0, 16 ; pand m1, m7 ; pand m0, m7 ; packssdw m0, m1 ; packssdw m2, m3 ; movu [%1], m0 ; mova [%2], m2 ; %endmacro ; ; %macro FILT8xA 4 ; movu m3, [r0+%4+mmsize] ; movu m2, [r0+%4] ; pavgw m3, [r0+%4+r5+mmsize] ; pavgw m2, [r0+%4+r5] ; PALIGNR %1, m3, 2, m6 ; pavgw %1, m3 ; PALIGNR m3, m2, 2, m6 ; pavgw m3, m2 ; %if cpuflag(xop) ; vpperm m5, m3, %1, m7 ; vpperm m3, m3, %1, m6 ; %else ; psrld m5, m3, 16 ; psrld m4, %1, 16 ; pand m3, m7 ; pand %1, m7 ; packssdw m3, %1 ; packssdw m5, m4 ; %endif ; %if cpuflag(avx2) ; vpermq m3, m3, q3120 ; vpermq m5, m5, q3120 ; %endif ; movu [%2], m3 ; movu [%3], m5 ; 
movu %1, m2 ; %endmacro ; ; ;----------------------------------------------------------------------------- ; ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; ; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ; ;----------------------------------------------------------------------------- ; %macro FRAME_INIT_LOWRES 0 ; cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise ; %if HIGH_BIT_DEPTH ; shl dword r6m, 1 ; FIX_STRIDES r5 ; shl dword r7m, 1 ; %endif ; %if mmsize >= 16 ; add dword r7m, mmsize-1 ; and dword r7m, ~(mmsize-1) ; %endif ; ; src += 2*(height-1)*stride + 2*width ; mov r6d, r8m ; dec r6d ; imul r6d, r5d ; add r6d, r7m ; lea r0, [r0+r6*2] ; ; dst += (height-1)*stride + width ; mov r6d, r8m ; dec r6d ; imul r6d, r6m ; add r6d, r7m ; add r1, r6 ; add r2, r6 ; add r3, r6 ; add r4, r6 ; ; gap = stride - width ; mov r6d, r6m ; sub r6d, r7m ; PUSH r6 ; %define dst_gap [rsp+gprsize] ; mov r6d, r5d ; sub r6d, r7m ; shl r6d, 1 ; PUSH r6 ; %define src_gap [rsp] ; %if HIGH_BIT_DEPTH ; %if cpuflag(xop) ; mova m6, [deinterleave_shuf32a] ; mova m7, [deinterleave_shuf32b] ; %else ; pcmpeqw m7, m7 ; psrld m7, 16 ; %endif ; .vloop: ; mov r6d, r7m ; %ifnidn cpuname, mmx2 ; movu m0, [r0] ; movu m1, [r0+r5] ; pavgw m0, m1 ; pavgw m1, [r0+r5*2] ; %endif ; .hloop: ; sub r0, mmsize*2 ; sub r1, mmsize ; sub r2, mmsize ; sub r3, mmsize ; sub r4, mmsize ; %ifidn cpuname, mmx2 ; FILT8xU r1, r2, 0 ; FILT8xU r3, r4, r5 ; %else ; FILT8xA m0, r1, r2, 0 ; FILT8xA m1, r3, r4, r5 ; %endif ; sub r6d, mmsize ; jg .hloop ; %else ; !HIGH_BIT_DEPTH ; %if cpuflag(avx2) ; mova m7, [deinterleave_shuf] ; %elif cpuflag(xop) ; mova m6, [deinterleave_shuf32a] ; mova m7, [deinterleave_shuf32b] ; %else ; pcmpeqb m7, m7 ; psrlw m7, 8 ; %endif ; .vloop: ; mov r6d, r7m ; %ifnidn cpuname, mmx2 ; %if mmsize <= 16 ; mova m0, [r0] ; mova m1, [r0+r5] ; pavgb m0, m1 ; pavgb m1, [r0+r5*2] ; %endif ; %endif ; .hloop: ; sub r0, mmsize*2 ; sub r1, mmsize ; sub r2, mmsize ; sub r3, mmsize ; sub r4, mmsize ; %if mmsize==32 ; FILT32x4U r1, r2, r3, r4 ; %elifdef m8 ; FILT8x4 m0, m1, m2, m3, m10, m11, mmsize ; mova m8, m0 ; mova m9, m1 ; FILT8x4 m2, m3, m0, m1, m4, m5, 0 ; %if cpuflag(xop) ; vpperm m4, m2, m8, m7 ; vpperm m2, m2, m8, m6 ; vpperm m5, m3, m9, m7 ; vpperm m3, m3, m9, m6 ; %else ; packuswb m2, m8 ; packuswb m3, m9 ; packuswb m4, m10 ; packuswb m5, m11 ; %endif ; mova [r1], m2 ; mova [r2], m4 ; mova [r3], m3 ; mova [r4], m5 ; %elifidn cpuname, mmx2 ; FILT8x2U r1, r2, 0 ; FILT8x2U r3, r4, r5 ; %else ; FILT16x2 m0, r1, r2, 0 ; FILT16x2 m1, r3, r4, r5 ; %endif ; sub r6d, mmsize ; jg .hloop ; %endif ; HIGH_BIT_DEPTH ; .skip: ; mov r6, dst_gap ; sub r0, src_gap ; sub r1, r6 ; sub r2, r6 ; sub r3, r6 ; sub r4, r6 ; dec dword r8m ; jg .vloop ; ADD rsp, 2*gprsize ; emms ; RET ; %endmacro ; FRAME_INIT_LOWRES ; ; INIT_MMX mmx2 ; FRAME_INIT_LOWRES ; %if ARCH_X86_64 == 0 ; INIT_MMX cache32, mmx2 ; FRAME_INIT_LOWRES ; %endif ; INIT_XMM sse2 ; FRAME_INIT_LOWRES ; INIT_XMM ssse3 ; FRAME_INIT_LOWRES ; INIT_XMM avx ; FRAME_INIT_LOWRES ; INIT_XMM xop ; FRAME_INIT_LOWRES ; %if ARCH_X86_64 == 1 ; INIT_YMM avx2 ; FRAME_INIT_LOWRES ; %endif ; ; ;----------------------------------------------------------------------------- ; ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs, ; ; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len ) ; 
;----------------------------------------------------------------------------- ; INIT_XMM sse2 ; cglobal mbtree_propagate_cost, 7,7,7 ; dec r6d ; movsd m6, [r5] ; mulpd m6, [pd_inv256] ; xor r5d, r5d ; lea r0, [r0+r5*2] ; pxor m4, m4 ; movlhps m6, m6 ; mova m5, [pw_3fff] ; ; .loop: ; movh m2, [r2+r5*4] ; intra ; movh m0, [r4+r5*4] ; invq ; movd m3, [r3+r5*2] ; inter ; pand m3, m5 ; punpcklwd m3, m4 ; ; ; PMINSD ; pcmpgtd m1, m2, m3 ; pand m3, m1 ; pandn m1, m2 ; por m3, m1 ; ; movd m1, [r1+r5*2] ; prop ; punpckldq m2, m2 ; punpckldq m0, m0 ; pmuludq m0, m2 ; pshufd m2, m2, q3120 ; pshufd m0, m0, q3120 ; ; punpcklwd m1, m4 ; cvtdq2pd m0, m0 ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; cvtdq2pd m1, m1 ; prop ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;cvtdq2ps m1, m2 ; intra ; cvtdq2pd m1, m2 ; intra ; psubd m2, m3 ; intra - inter ; cvtdq2pd m2, m2 ; intra - inter ; ;rcpps m3, m1 ; ;mulps m1, m3 ; intra * (1/intra 1st approx) ; ;mulps m1, m3 ; intra * (1/intra 1st approx)^2 ; ;addps m3, m3 ; 2 * (1/intra 1st approx) ; ;subps m3, m1 ; 2nd approximation for 1/intra ; ;cvtps2pd m3, m3 ; 1 / intra 1st approximation ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ;mulpd m0, m3 ; / intra ; ; ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq m0, m0 ; ; movh [r0+r5*4], m0 ; add r5d, 2 ; cmp r5d, r6d ; jl .loop ; ; xor r6d, r5d ; jnz .even ; movd m2, [r2+r5*4] ; intra ; movd m0, [r4+r5*4] ; invq ; movd m3, [r3+r5*2] ; inter ; pand m3, m5 ; punpcklwd m3, m4 ; ; ; PMINSD ; pcmpgtd m1, m2, m3 ; pand m3, m1 ; pandn m1, m2 ; por m3, m1 ; ; movd m1, [r1+r5*2] ; prop ; punpckldq m2, m2 ; DWORD [_ 1 _ 0] ; punpckldq m0, m0 ; pmuludq m0, m2 ; QWORD [m1 m0] ; pshufd m2, m2, q3120 ; pshufd m0, m0, q3120 ; punpcklwd m1, m4 ; cvtdq2pd m0, m0 ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; cvtdq2pd m1, m1 ; prop ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; cvtdq2pd m1, m2 ; intra ; psubd m2, m3 ; intra - inter ; cvtdq2pd m2, m2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq m0, m0 ; movd [r0+r5*4], m0 ; .even: ; RET ; ; ; ;----------------------------------------------------------------------------- ; ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs, ; ; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len ) ; ;----------------------------------------------------------------------------- ; ; FIXME: align loads/stores to 16 bytes ; %macro MBTREE_AVX 0 ; cglobal mbtree_propagate_cost, 7,7,7 ; sub r6d, 3 ; vbroadcastsd m6, [r5] ; mulpd m6, [pd_inv256] ; xor r5d, r5d ; mova m5, [pw_3fff] ; ; .loop: ; movu xm2, [r2+r5*4] ; intra ; movu xm0, [r4+r5*4] ; invq ; pmovzxwd xm3, [r3+r5*2] ; inter ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; pmovzxwd xm1, [r1+r5*2] ; prop ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; ; movu 
[r0+r5*4], xm0 ; add r5d, 4 ; process 4 values in one iteration ; cmp r5d, r6d ; jl .loop ; ; add r6d, 3 ; xor r6d, r5d ; jz .even ; if loop counter is multiple of 4, all values are processed ; ; and r6d, 3 ; otherwise, remaining unprocessed values must be 1, 2 or 3 ; cmp r6d, 1 ; je .process1 ; if only 1 value is unprocessed ; ; ; process 2 values here ; movq xm2, [r2+r5*4] ; intra ; movq xm0, [r4+r5*4] ; invq ; movd xm3, [r3+r5*2] ; inter ; pmovzxwd xm3, xm3 ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; movd xm1, [r1+r5*2] ; prop ; pmovzxwd xm1, xm1 ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; movq [r0+r5*4], xm0 ; ; xor r6d, 2 ; jz .even ; add r5d, 2 ; ; ; process 1 value here ; .process1: ; movd xm2, [r2+r5*4] ; intra ; movd xm0, [r4+r5*4] ; invq ; movzx r6d, word [r3+r5*2] ; inter ; movd xm3, r6d ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; movzx r6d, word [r1+r5*2] ; prop ; movd xm1, r6d ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; movd [r0+r5*4], xm0 ; .even: ; RET ; %endmacro ; ; INIT_YMM avx ; MBTREE_AVX ; ; INIT_YMM avx2 ; MBTREE_AVX ; ; ; %macro CUTREE_FIX8 0 ; ;----------------------------------------------------------------------------- ; ; void cutree_fix8_pack( uint16_t *dst, double *src, int count ) ; ;----------------------------------------------------------------------------- ; cglobal cutree_fix8_pack, 3, 4, 5 ; movapd m2, [pq_256] ; sub r2d, mmsize / 2 ; movsxdifnidn r2, r2d ; lea r1, [r1 + 8 * r2] ; lea r0, [r0 + 2 * r2] ; neg r2 ; jg .skip_loop ; .loop: ; mulpd m0, m2, [r1 + 8 * r2] ; mulpd m1, m2, [r1 + 8 * r2 + mmsize] ; mulpd m3, m2, [r1 + 8 * r2 + 2 * mmsize] ; mulpd m4, m2, [r1 + 8 * r2 + 3 * mmsize] ; cvttpd2dq xm0, m0 ; cvttpd2dq xm1, m1 ; cvttpd2dq xm3, m3 ; cvttpd2dq xm4, m4 ; %if mmsize == 32 ; vinserti128 m0, m0, xm3, 1 ; vinserti128 m1, m1, xm4, 1 ; packssdw m0, m1 ; %else ; punpcklqdq m0, m1 ; punpcklqdq m3, m4 ; packssdw m0, m3 ; %endif ; mova [r0 + 2 * r2], m0 ; add r2, mmsize / 2 ; jle .loop ; .skip_loop: ; sub r2, mmsize / 2 ; jz .end ; ; Do the remaining values in scalar in order to avoid overreading src. 
; .scalar: ; movq xm0, [r1 + 8 * r2 + 4 * mmsize] ; mulsd xm0, xm2 ; cvttsd2si r3d, xm0 ; mov [r0 + 2 * r2 + mmsize], r3w ; inc r2 ; jl .scalar ; .end: ; RET ; ; ;----------------------------------------------------------------------------- ; ; void cutree_fix8_unpack( double *dst, uint16_t *src, int count ) ; ;----------------------------------------------------------------------------- ; cglobal cutree_fix8_unpack, 3, 4, 7 ; %if mmsize != 32 ; mova m4, [cutree_fix8_unpack_shuf+16] ; %endif ; movapd m2, [pd_inv256] ; mova m3, [cutree_fix8_unpack_shuf] ; sub r2d, mmsize / 2 ; movsxdifnidn r2, r2d ; lea r1, [r1 + 2 * r2] ; lea r0, [r0 + 8 * r2] ; neg r2 ; jg .skip_loop ; .loop: ; %if mmsize == 32 ; vbroadcasti128 m0, [r1 + 2 * r2] ; vbroadcasti128 m1, [r1 + 2 * r2 + 16] ; pshufb m0, m3 ; pshufb m1, m3 ; %else ; mova m1, [r1 + 2 * r2] ; pshufb m0, m1, m3 ; pshufb m1, m4 ; %endif ; psrad m0, 16 ; sign-extend ; psrad m1, 16 ; cvtdq2pd m5, xm0 ; cvtdq2pd m6, xm1 ; %if mmsize == 32 ; vpermq m0, m0, q1032 ; vpermq m1, m1, q1032 ; %else ; psrldq m0, 8 ; psrldq m1, 8 ; %endif ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; mulpd m0, m2 ; mulpd m1, m2 ; mulpd m5, m2 ; mulpd m6, m2 ; movapd [r0 + 8 * r2], m5 ; movapd [r0 + 8 * r2 + mmsize], m0 ; movapd [r0 + 8 * r2 + mmsize * 2], m6 ; movapd [r0 + 8 * r2 + mmsize * 3], m1 ; add r2, mmsize / 2 ; jle .loop ; .skip_loop: ; sub r2, mmsize / 2 ; jz .end ; .scalar: ; movzx r3d, word [r1 + 2 * r2 + mmsize] ; movsx r3d, r3w ; cvtsi2sd xm0, r3d ; mulsd xm0, xm2 ; movsd [r0 + 8 * r2 + 4 * mmsize], xm0 ; inc r2 ; jl .scalar ; .end: ; RET ; %endmacro ; ; INIT_XMM ssse3 ; CUTREE_FIX8 ; ; INIT_YMM avx2 ; CUTREE_FIX8 xavs2-1.3/source/common/x86/mc.h000066400000000000000000000063731340660520300164150ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Steve Borho * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #ifndef XAVS2_I386_MC_H #define XAVS2_I386_MC_H #define xavs2_plane_copy_core_mmx2 FPFX(plane_copy_core_mmx2) void xavs2_plane_copy_core_mmx2(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); #define xavs2_plane_copy_deinterleave_mmx FPFX(plane_copy_deinterleave_mmx) void xavs2_plane_copy_deinterleave_mmx(pel_t *dstu, intptr_t i_dstu, pel_t *dstv, intptr_t i_dstv, pel_t *src, intptr_t i_src, int w, int h); #define xavs2_memcpy_aligned_mmx FPFX(memcpy_aligned_mmx) void *xavs2_memcpy_aligned_mmx(void *dst, const void *src, size_t n); #define xavs2_memcpy_aligned_sse FPFX(memcpy_aligned_sse) void *xavs2_memcpy_aligned_sse(void *dst, const void *src, size_t n); #define xavs2_fast_memcpy_mmx FPFX(fast_memcpy_mmx) void *xavs2_fast_memcpy_mmx (void *dst, const void *src, size_t n); #define xavs2_fast_memset_mmx FPFX(fast_memset_mmx) void *xavs2_fast_memset_mmx (void *dst, int val, size_t n); #define xavs2_memzero_aligned_mmx FPFX(memzero_aligned_mmx) void *xavs2_memzero_aligned_mmx(void *dst, size_t n); #define xavs2_memzero_aligned_sse FPFX(memzero_aligned_sse) void *xavs2_memzero_aligned_sse(void *dst, size_t n); #define xavs2_memzero_aligned_avx FPFX(memzero_aligned_avx) void *xavs2_memzero_aligned_avx(void *dst, size_t n); #define xavs2_fast_memzero_mmx FPFX(fast_memzero_mmx) void *xavs2_fast_memzero_mmx (void *dst, size_t n); #define xavs2_lowres_filter_core_mmx2 FPFX(lowres_filter_core_mmx2) void xavs2_lowres_filter_core_mmx2 (pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height); #define xavs2_lowres_filter_core_sse2 FPFX(lowres_filter_core_sse2) void xavs2_lowres_filter_core_sse2 (pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height); #define xavs2_lowres_filter_core_ssse3 FPFX(lowres_filter_core_ssse3) void xavs2_lowres_filter_core_ssse3(pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height); #define xavs2_lowres_filter_core_avx FPFX(lowres_filter_core_avx) void xavs2_lowres_filter_core_avx (pel_t *src, int i_src, pel_t *dst, int i_dst, int width, int height); #endif // XAVS2_I386_MC_H xavs2-1.3/source/common/x86/pixel-32.asm000066400000000000000000000262511340660520300177070ustar00rootroot00000000000000;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Loren Merritt ;* Laurent Aimar ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" %if ARCH_X86_64 == 0 cextern pw_ppmmppmm cextern pw_pmpmpmpm SECTION .text INIT_MMX mmx2 %macro LOAD_DIFF_4x8P 1 ; dx LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1] LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3] LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1] LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3] LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] movq [spill], m5 LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5] movq m5, [spill] %endmacro %macro SUM4x8_MM 0 movq [spill], m6 movq [spill+8], m7 ABSW2 m0, m1, m0, m1, m6, m7 ABSW2 m2, m3, m2, m3, m6, m7 paddw m0, m2 paddw m1, m3 movq m6, [spill] movq m7, [spill+8] ABSW2 m4, m5, m4, m5, m2, m3 ABSW2 m6, m7, m6, m7, m2, m3 paddw m4, m6 paddw m5, m7 paddw m0, m4 paddw m1, m5 paddw m0, m1 %endmacro ;----------------------------------------------------------------------------- ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal push r0 push r2 sub esp, 0x74 %define args esp+0x74 %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 LOAD_DIFF_4x8P 0 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m1 TRANSPOSE4x4W 4, 5, 6, 7, 1 movq [trans+0x00], m4 movq [trans+0x08], m5 movq [trans+0x10], m6 movq [trans+0x18], m7 movq m1, [spill] TRANSPOSE4x4W 0, 1, 2, 3, 4 movq [trans+0x20], m0 movq [trans+0x28], m1 movq [trans+0x30], m2 movq [trans+0x38], m3 mov r0, [args+4] mov r2, [args] LOAD_DIFF_4x8P 4 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 movq [trans+0x40], m0 movq [trans+0x48], m1 movq [trans+0x50], m2 movq [trans+0x58], m3 movq m7, [spill] TRANSPOSE4x4W 4, 5, 6, 7, 1 movq m0, [trans+0x00] movq m1, [trans+0x08] movq m2, [trans+0x10] movq m3, [trans+0x18] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 SUM4x8_MM movq [trans], m0 movq m0, [trans+0x20] movq m1, [trans+0x28] movq m2, [trans+0x30] movq m3, [trans+0x38] movq m4, [trans+0x40] movq m5, [trans+0x48] movq m6, [trans+0x50] movq m7, [trans+0x58] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 SUM4x8_MM pavgw m0, [trans] add esp, 0x7c ret %undef args %undef spill %undef trans %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op pxor %7, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 paddusw %1, %4 paddusw %2, %5 paddusw %3, %6 punpcklwd %1, %7 punpcklwd %2, %7 punpcklwd %3, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 %8 %1, %4 %8 %2, %5 %8 %3, %6 %endmacro %macro LOAD_4x8P 1 ; dx pxor m7, m7 movd m6, [r0+%1+7*FENC_STRIDE] movd m0, [r0+%1+0*FENC_STRIDE] movd m1, [r0+%1+1*FENC_STRIDE] movd m2, [r0+%1+2*FENC_STRIDE] movd m3, [r0+%1+3*FENC_STRIDE] movd m4, [r0+%1+4*FENC_STRIDE] movd m5, [r0+%1+5*FENC_STRIDE] punpcklbw m6, m7 punpcklbw m0, m7 punpcklbw m1, m7 movq [spill], m6 punpcklbw m2, m7 punpcklbw m3, m7 movd m6, [r0+%1+6*FENC_STRIDE] punpcklbw m4, m7 punpcklbw m5, m7 punpcklbw m6, m7 movq m7, [spill] %endmacro %macro HSUMSUB2 4 pshufw m4, %1, %3 pshufw m5, %2, %3 pmullw %1, %4 pmullw m5, %4 paddw %1, m4 paddw %2, m5 %endmacro ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- cglobal intra_sa8d_x3_8x8, 2,3 SUB esp, 
0x94 %define edge esp+0x70 ; +32 %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 %define sum esp+0 ; +32 pxor m7, m7 movq m0, [r1+7] movq m2, [r1+16] movq m1, m0 movq m3, m2 punpcklbw m0, m7 punpckhbw m1, m7 punpcklbw m2, m7 punpckhbw m3, m7 movq m6, [pw_ppmmppmm] HSUMSUB2 m0, m2, q1032, m6 HSUMSUB2 m1, m3, q1032, m6 movq m6, [pw_pmpmpmpm] HSUMSUB2 m0, m2, q2301, m6 HSUMSUB2 m1, m3, q2301, m6 movq m4, m0 movq m5, m2 paddw m0, m1 paddw m2, m3 psubw m4, m1 psubw m3, m5 movq [edge+0], m0 movq [edge+8], m4 movq [edge+16], m2 movq [edge+24], m3 LOAD_4x8P 0 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m0 TRANSPOSE4x4W 4, 5, 6, 7, 0 movq [trans+0x00], m4 movq [trans+0x08], m5 movq [trans+0x10], m6 movq [trans+0x18], m7 movq m0, [spill] TRANSPOSE4x4W 0, 1, 2, 3, 4 movq [trans+0x20], m0 movq [trans+0x28], m1 movq [trans+0x30], m2 movq [trans+0x38], m3 LOAD_4x8P 4 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 movq [trans+0x40], m0 movq [trans+0x48], m1 movq [trans+0x50], m2 movq [trans+0x58], m3 movq m7, [spill] TRANSPOSE4x4W 4, 5, 6, 7, 0 movq m0, [trans+0x00] movq m1, [trans+0x08] movq m2, [trans+0x10] movq m3, [trans+0x18] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill+0], m0 movq [spill+8], m1 ABSW2 m2, m3, m2, m3, m0, m1 ABSW2 m4, m5, m4, m5, m0, m1 paddw m2, m4 paddw m3, m5 ABSW2 m6, m7, m6, m7, m4, m5 movq m0, [spill+0] movq m1, [spill+8] paddw m2, m6 paddw m3, m7 paddw m2, m3 ABSW m1, m1, m4 paddw m2, m1 ; 7x4 sum movq m7, m0 movq m1, [edge+8] ; left bottom psllw m1, 3 psubw m7, m1 ABSW2 m0, m7, m0, m7, m5, m3 paddw m0, m2 paddw m7, m2 movq [sum+0], m0 ; dc movq [sum+8], m7 ; left movq m0, [trans+0x20] movq m1, [trans+0x28] movq m2, [trans+0x30] movq m3, [trans+0x38] movq m4, [trans+0x40] movq m5, [trans+0x48] movq m6, [trans+0x50] movq m7, [trans+0x58] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movd [sum+0x10], m0 movd [sum+0x12], m1 movd [sum+0x14], m2 movd [sum+0x16], m3 movd [sum+0x18], m4 movd [sum+0x1a], m5 movd [sum+0x1c], m6 movd [sum+0x1e], m7 movq [spill], m0 movq [spill+8], m1 ABSW2 m2, m3, m2, m3, m0, m1 ABSW2 m4, m5, m4, m5, m0, m1 paddw m2, m4 paddw m3, m5 paddw m2, m3 movq m0, [spill] movq m1, [spill+8] ABSW2 m6, m7, m6, m7, m4, m5 ABSW m1, m1, m3 paddw m2, m7 paddw m1, m6 paddw m2, m1 ; 7x4 sum movq m1, m0 movq m7, [edge+0] psllw m7, 3 ; left top mov r2, [edge+0] add r2, [edge+16] lea r2, [4*r2+32] and r2, 0xffc0 movd m6, r2 ; dc psubw m1, m7 psubw m0, m6 ABSW2 m0, m1, m0, m1, m5, m6 movq m3, [sum+0] ; dc paddw m0, m2 paddw m1, m2 movq m2, m0 paddw m0, m3 paddw m1, [sum+8] ; h psrlq m2, 16 paddw m2, m3 movq m3, [edge+16] ; top left movq m4, [edge+24] ; top right psllw m3, 3 psllw m4, 3 psubw m3, [sum+16] psubw m4, [sum+24] ABSW2 m3, m4, m3, m4, m5, m6 paddw m2, m3 paddw m2, m4 ; v SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw mov r2, r2m pxor m7, m7 punpckldq m2, m1 pavgw m0, m7 pavgw m2, m7 movd [r2+8], m0 ; dc movq [r2+0], m2 ; v, h ADD esp, 0x94 RET %undef edge %undef spill %undef trans %undef sum ;----------------------------------------------------------------------------- ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- cglobal pixel_ssim_4x4x2_core, 0,5 mov r1, r1m mov r3, r3m mov r4, 4 pxor m0, m0 .loop: mov r0, r0m mov r2, r2m add r0, r4 add r2, r4 pxor m1, m1 pxor m2, m2 pxor m3, m3 pxor m4, m4 %rep 4 movd m5, [r0] movd m6, [r2] punpcklbw m5, m0 punpcklbw m6, m0 paddw m1, m5 
paddw m2, m6 movq m7, m5 pmaddwd m5, m5 pmaddwd m7, m6 pmaddwd m6, m6 paddd m3, m5 paddd m4, m7 paddd m3, m6 add r0, r1 add r2, r3 %endrep mov r0, r4m lea r0, [r0+r4*4] pshufw m5, m1, q0032 pshufw m6, m2, q0032 paddusw m1, m5 paddusw m2, m6 punpcklwd m1, m2 pshufw m2, m1, q0032 pshufw m5, m3, q0032 pshufw m6, m4, q0032 paddusw m1, m2 paddd m3, m5 paddd m4, m6 punpcklwd m1, m0 punpckldq m3, m4 movq [r0+0], m1 movq [r0+8], m3 sub r4, 4 jge .loop emms RET %endif ; if ARCH_X86_64 == 0 xavs2-1.3/source/common/x86/pixel-a.asm000066400000000000000000007642171340660520300177160ustar00rootroot00000000000000;***************************************************************************** ;* pixel-a.asm: x86 ssd functions ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 hmul_8p: times 8 db 1 times 4 db 1, -1 times 8 db 1 times 4 db 1, -1 hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 hmul_8w: times 4 dw 1 times 2 dw 1, -1 times 4 dw 1 times 2 dw 1, -1 ALIGN 32 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 SECTION .text cextern pb_0 cextern pb_1 cextern pw_1 cextern pw_8 cextern pw_16 cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz cextern pd_1 cextern pd_2 cextern hmul_16p cextern pb_movemask cextern pb_movemask_32 cextern pw_pixel_max ;============================================================================= ; SATD ;============================================================================= %macro JDUP 2 %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 %elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 movsldup %1, %1 %else ; doesn't need to dup. 
sse2 does things by zero extending to words and full h_2d punpckldq %1, %2 %endif %endmacro %macro HSUMSUB 5 pmaddubsw m%2, m%5 pmaddubsw m%1, m%5 pmaddubsw m%4, m%5 pmaddubsw m%3, m%5 %endmacro %macro DIFF_UNPACK_SSE2 5 punpcklbw m%1, m%5 punpcklbw m%2, m%5 punpcklbw m%3, m%5 punpcklbw m%4, m%5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro DIFF_SUMSUB_SSSE3 5 HSUMSUB %1, %2, %3, %4, %5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer movd %1, %3 movd %2, %4 JDUP %1, %2 %endmacro %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer movddup m%3, %6 movddup m%4, %8 movddup m%1, %5 movddup m%2, %7 %endmacro %macro LOAD_DUP_4x8P_PENRYN 8 ; penryn and nehalem run punpcklqdq and movddup in different units movh m%3, %6 movh m%4, %8 punpcklqdq m%3, m%3 movddup m%1, %5 punpcklqdq m%4, m%4 movddup m%2, %7 %endmacro %macro LOAD_SUMSUB_8x2P 9 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr movddup m%1, [%7] movddup m%2, [%7+8] mova m%4, [%6] movddup m%3, m%4 punpckhqdq m%4, m%4 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr movu m%4, [%7] mova m%2, [%6] DEINTB %1, %2, %3, %4, %5 psubw m%1, m%3 psubw m%2, m%4 SUMSUB_BA w, %1, %2, %3 %endmacro %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro %macro LOAD_SUMSUB_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr vbroadcasti128 m%1, [%6] vbroadcasti128 m%3, [%7] vbroadcasti128 m%2, [%8] vbroadcasti128 m%4, [%9] DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer mova xm%3, %6 mova xm%4, %8 mova xm%1, %5 mova xm%2, %7 vpermq m%3, m%3, q0011 vpermq m%4, m%4, q0011 vpermq m%1, m%1, q0011 vpermq m%2, m%2, q0011 %endmacro %macro LOAD_SUMSUB8_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %endmacro ; SATDS_SSE2 ;============================================================================= ; SA8D ;============================================================================= %macro SA8D_INTER 0 %if ARCH_X86_64 %define lh m10 %define rh m0 %else %define lh m0 %define rh [esp+48] %endif %if HIGH_BIT_DEPTH HADDUW m0, m1 paddd lh, rh %else paddusw lh, rh %endif ; HIGH_BIT_DEPTH %endmacro %macro SA8D_8x8 0 call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH paddd m0, [pd_1] psrld m0, 1 paddd m12, m0 %endmacro %macro SA8D_16x16 0 call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 call pixel_sa8d_8x8_internal ; pix[8] lea r2, [r2+8*r3] lea r0, [r0+8*r1] SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r2, 8*SIZEOF_PIXEL sub r0, 8*SIZEOF_PIXEL SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride] SA8D_INTER SWAP 0, 10 %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif paddd m0, [pd_1] psrld m0, 1 paddd m12, m0 %endmacro %macro AVG_16x16 0 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d %endmacro %macro SA8D 0 ; sse2 doesn't seem to like the horizontal way of doing things %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal lea r6, [r0+4*r1] lea r7, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 %if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 %endif paddw m0, m1 paddw m0, m2 paddw m0, m8 SAVE_MM_PERMUTATION ret cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] %if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] %if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 call pixel_sa8d_8x8_internal ; pix[8] lea r2, [r2+8*r3] lea r0, [r0+8*r1] SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r2, 8*SIZEOF_PIXEL sub r0, 8*SIZEOF_PIXEL SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride] SA8D_INTER SWAP 0, 10 %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd eax, m0 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_8x16, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 lea 
r0, [r0 + 8*r1] lea r2, [r2 + 8*r3] SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_8x32, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_16x8, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_16x32, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_16x64, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_24x32, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_32x8, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_32x16, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_32x24, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 sub r0, 8*SIZEOF_PIXEL sub r2, 8*SIZEOF_PIXEL SA8D_8x8 lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_8x8 movd eax, m12 RET cglobal pixel_sa8d_32x32, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, 
[3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_32x64, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_48x64, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_64x16, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_64x32, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] 
SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_64x48, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET cglobal pixel_sa8d_64x64, 4,8,13 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] pxor m12, m12 %if vertical == 0 mova m7, [hmul_8p] %endif SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, 
[8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 add r2, 16*SIZEOF_PIXEL add r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r0, [r0+8*r1] lea r2, [r2+8*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 lea r4, [8*r1] lea r5, [8*r3] sub r0, r4 sub r2, r5 sub r2, 16*SIZEOF_PIXEL sub r0, 16*SIZEOF_PIXEL lea r4, [3*r1] lea r5, [3*r3] SA8D_16x16 movd eax, m12 RET %else ; ARCH_X86_32 %if mmsize == 16 cglobal pixel_sa8d_8x8_internal %define spill0 [esp+4] %define spill1 [esp+20] %define spill2 [esp+36] %if vertical LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 HADAMARD4_2D 0, 1, 2, 3, 4 movdqa spill0, m3 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 HADAMARD4_2D 4, 5, 6, 7, 3 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax movdqa m3, spill0 paddw m0, m1 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax %else ; mmsize == 8 mova m7, [hmul_8p] LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 ; could do first HADAMARD4_V here to save spilling later ; surprisingly, not a win on conroe or even p4 mova spill0, m2 mova spill1, m3 mova spill2, m1 SWAP 1, 7 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 HADAMARD4_V 4, 5, 6, 7, 3 mova m1, spill2 mova m2, spill0 mova m3, spill1 mova spill0, m6 mova spill1, m7 HADAMARD4_V 0, 1, 2, 3, 7 SUMSUB_BADC w, 0, 4, 1, 5, 7 HADAMARD 2, sumsub, 0, 4, 7, 6 HADAMARD 2, sumsub, 1, 5, 7, 6 HADAMARD 1, amax, 0, 4, 7, 6 HADAMARD 1, amax, 1, 5, 7, 6 mova m6, spill0 mova m7, spill1 paddw m0, m1 SUMSUB_BADC w, 2, 6, 3, 7, 4 HADAMARD 2, sumsub, 2, 6, 4, 5 HADAMARD 2, sumsub, 3, 7, 4, 5 HADAMARD 1, amax, 2, 6, 4, 5 HADAMARD 1, amax, 3, 7, 4, 5 %endif ; sse2/non-sse2 paddw m0, m2 paddw m0, m3 SAVE_MM_PERMUTATION ret %endif ; ifndef mmx2 cglobal pixel_sa8d_8x8_internal2 %define spill0 [esp+4] LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 HADAMARD4_2D 0, 1, 2, 3, 4 movdqa spill0, m3 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 HADAMARD4_2D 4, 5, 6, 7, 3 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax movdqa m3, spill0 paddw m0, m1 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax paddw m0, m2 paddw m0, m3 SAVE_MM_PERMUTATION ret cglobal pixel_sa8d_8x8, 4,7 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 48 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET cglobal pixel_sa8d_16x16, 4,7 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal %if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal %if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %else SA8D_INTER %endif mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH SA8D_INTER %else ; !HIGH_BIT_DEPTH paddusw m0, [esp+64-mmsize] %if mmsize == 16 HADDUW m0, m1 %else mova m2, [esp+48] pxor m7, m7 mova m1, m0 mova m3, m2 punpcklwd m0, m7 punpckhwd m1, m7 punpcklwd m2, m7 punpckhwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m2 HADDD m0, m1 %endif %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET cglobal pixel_sa8d_8x16, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp 
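; x86-32 path: r6 keeps the original esp so the pixel/stride arguments can be
; reloaded from [r6+20]/[r6+28]; esp is then aligned to 16 bytes and 64 bytes
; are reserved for spills, with the running sum kept in dword [esp+36].
; Each 8x8 partial below is rounded as (sum+1)>>1 before being accumulated.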
and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_8x32, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_16x8, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_16x32, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_16x64, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if 
HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_24x32, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword 
[esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_32x8, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_32x16, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_32x24, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 
+ 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 HADDUW m0, m1 movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_32x32, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea 
r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_32x64, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call 
pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_48x64, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call 
pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x16, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if 
HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x32, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL 
add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x48, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 
48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, 
[r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x64, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 
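; SA8D_INTER (macro defined earlier in this file) folds the 8x8 result just
; computed into the partial sum saved at [esp+48]; the combined value is
; written back to [esp+48] on the following line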
SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], 
m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET %endif ; !ARCH_X86_64 %endmacro ; SA8D %if ARCH_X86_64 == 1 && BIT_DEPTH == 12 INIT_YMM avx2 cglobal sa8d_8x8_12bit pmovzxwd m0, [r0] pmovzxwd m9, [r2] psubd m0, m9 pmovzxwd m1, [r0 + r1] pmovzxwd m9, [r2 + r3] psubd m1, m9 pmovzxwd m2, [r0 + r1 * 2] pmovzxwd m9, [r2 + r3 * 2] psubd m2, m9 pmovzxwd m8, [r0 + r4] pmovzxwd m9, [r2 + r5] psubd m8, m9 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxwd m4, [r0] pmovzxwd m9, [r2] psubd m4, m9 pmovzxwd m5, [r0 + r1] pmovzxwd m9, [r2 + r3] psubd m5, m9 pmovzxwd m3, [r0 + r1 * 2] pmovzxwd m9, [r2 + r3 * 2] psubd m3, m9 pmovzxwd m7, [r0 + r4] pmovzxwd m9, [r2 + r5] psubd m7, m9 mova m6, m0 paddd m0, m1 psubd m1, m6 mova m6, m2 paddd m2, m8 psubd m8, m6 mova m6, m0 punpckldq m0, m1 punpckhdq m6, m1 mova m1, m0 paddd m0, m6 psubd m6, m1 mova m1, m2 punpckldq m2, m8 punpckhdq m1, m8 mova m8, m2 paddd m2, m1 psubd m1, m8 mova m8, m4 paddd m4, m5 psubd m5, m8 mova m8, m3 paddd m3, m7 psubd m7, m8 mova m8, m4 punpckldq m4, m5 punpckhdq m8, m5 mova m5, m4 paddd m4, m8 psubd m8, m5 mova m5, m3 punpckldq m3, m7 punpckhdq m5, m7 mova m7, m3 paddd m3, m5 psubd m5, m7 mova m7, m0 paddd m0, m2 psubd m2, m7 mova m7, m6 paddd m6, m1 psubd m1, m7 mova m7, m0 punpcklqdq m0, m2 punpckhqdq m7, m2 mova m2, m0 paddd m0, m7 psubd m7, m2 mova m2, m6 punpcklqdq m6, m1 punpckhqdq m2, m1 mova m1, m6 paddd m6, m2 psubd m2, m1 mova m1, m4 paddd m4, m3 psubd m3, m1 mova m1, m8 paddd m8, m5 psubd m5, m1 mova m1, m4 punpcklqdq m4, m3 punpckhqdq m1, m3 mova m3, m4 paddd m4, m1 psubd m1, m3 mova m3, m8 punpcklqdq m8, m5 punpckhqdq m3, m5 mova m5, m8 paddd m8, m3 psubd m3, m5 mova m5, m0 paddd m0, m4 psubd m4, m5 mova m5, m7 paddd m7, m1 psubd m1, m5 mova m5, m0 vinserti128 m0, m0, xm4, 1 vperm2i128 m5, m5, m4, 00110001b pxor m4, m4 psubd m4, m0 pmaxsd m0, m4 pxor m4, m4 psubd m4, m5 pmaxsd m5, m4 pmaxsd m0, m5 mova m4, m7 vinserti128 m7, m7, xm1, 1 vperm2i128 m4, m4, m1, 00110001b pxor m1, m1 psubd m1, m7 pmaxsd m7, m1 pxor m1, m1 psubd m1, m4 pmaxsd m4, m1 pmaxsd m7, m4 mova m1, m6 paddd m6, m8 psubd m8, m1 mova m1, m2 paddd m2, m3 psubd m3, m1 mova m1, m6 vinserti128 m6, m6, xm8, 1 vperm2i128 m1, m1, m8, 00110001b pxor m8, m8 psubd m8, m6 pmaxsd m6, m8 pxor m8, m8 psubd m8, m1 pmaxsd m1, m8 pmaxsd m6, m1 mova m8, m2 vinserti128 m2, m2, xm3, 1 vperm2i128 m8, m8, m3, 00110001b pxor m3, m3 psubd m3, m2 pmaxsd m2, m3 pxor m3, m3 psubd m3, m8 pmaxsd m8, m3 pmaxsd m2, m8 paddd m0, m6 paddd m0, m7 paddd m0, m2 ret cglobal pixel_sa8d_8x8, 4,6,10 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] call sa8d_8x8_12bit vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 movd eax, xm0 add eax, 1 shr eax, 1 RET cglobal 
pixel_sa8d_8x16, 4,7,11 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] pxor m10, m10 call sa8d_8x8_12bit vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm10, xm0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm0, xm10 movd eax, xm0 RET cglobal pixel_sa8d_16x16, 4,8,11 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] mov r6, r0 mov r7, r2 pxor m10, m10 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 movd eax, xm0 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x32, 4,8,12 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] mov r6, r0 mov r7, r2 pxor m10, m10 pxor m11, m11 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 movd eax, xm11 RET cglobal pixel_sa8d_32x32, 4,8,12 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] mov r6, r0 mov r7, r2 pxor m10, m10 pxor m11, m11 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] 
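; r2 is advanced the same way on the next line; combined with the 4-row step
; already taken inside sa8d_8x8_12bit, this lands r0/r2 on the 8x8 block
; directly below for the next call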
lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 movd eax, xm11 RET cglobal pixel_sa8d_32x64, 4,8,12 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] mov r6, r0 mov r7, r2 pxor m10, m10 pxor m11, m11 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call 
sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 movd eax, xm11 RET cglobal pixel_sa8d_64x64, 4,8,12 add r1d, r1d add r3d, r3d lea r4, [r1 + r1 * 2] lea r5, [r3 + r3 * 2] mov r6, r0 mov r7, r2 pxor m10, m10 pxor m11, m11 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 64] lea r2, [r7 + 64] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 80] lea r2, [r7 + 80] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 96] lea r2, [r7 + 96] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 112] lea r2, [r7 + 112] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, 
m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 64] lea r2, [r7 + 64] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 80] lea r2, [r7 + 80] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 96] lea r2, [r7 + 96] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 112] lea r2, [r7 + 112] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 64] lea r2, [r7 + 64] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 80] lea r2, [r7 + 80] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 96] lea r2, [r7 + 96] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 112] lea r2, [r7 + 112] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 lea r6, [r6 + r1 * 8] lea r6, [r6 + r1 * 8] lea r7, [r7 + r3 * 
8] lea r7, [r7 + r3 * 8] pxor m10, m10 mov r0, r6 mov r2, r7 call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 16] lea r2, [r7 + 16] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 32] lea r2, [r7 + 32] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 48] lea r2, [r7 + 48] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 64] lea r2, [r7 + 64] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 80] lea r2, [r7 + 80] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 pxor m10, m10 lea r0, [r6 + 96] lea r2, [r7 + 96] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r6 + 112] lea r2, [r7 + 112] call sa8d_8x8_12bit paddd m10, m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] call sa8d_8x8_12bit paddd m0, m10 vextracti128 xm6, m0, 1 paddd xm0, xm6 movhlps xm6, xm0 paddd xm0, xm6 pshuflw xm6, xm0, 0Eh paddd xm0, xm6 paddd xm0, [pd_1] psrld xm0, 1 paddd xm11, xm0 movd eax, xm11 RET %endif ;============================================================================= ; INTRA SATD ;============================================================================= %define TRANS TRANS_SSE2 %define DIFFOP DIFF_UNPACK_SSE2 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size %define movdqu movups %define punpcklqdq movlhps INIT_XMM sse2 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %if HIGH_BIT_DEPTH == 0 INIT_XMM ssse3,atom SATDS_SSE2 SA8D %endif %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE %if HIGH_BIT_DEPTH == 0 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 %endif INIT_XMM ssse3 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps %define TRANS TRANS_SSE4 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN INIT_XMM sse4 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so ; it's effectively free. 
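; Note: each %define + INIT_XMM/INIT_YMM pair in this region re-expands the
; SA8D / SATDS_SSE2 macro templates for one SIMD flavour; cglobal appends the
; ISA suffix (e.g. _sse4, _avx), and the C-side dispatcher is expected to pick
; the fastest variant the running CPU supports.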
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SA8D SATDS_SSE2 %define TRANS TRANS_XOP INIT_XMM xop %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %if HIGH_BIT_DEPTH == 0 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 %define TRANS TRANS_SSE4 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] movddup xm%1, [r0] movddup xm%3, [r2] movddup xm%2, [r0+4*r1] movddup xm%5, [r2+4*r3] vinserti128 m%1, m%1, xm%2, 1 vinserti128 m%3, m%3, xm%5, 1 movddup xm%2, [r0+r1] movddup xm%4, [r2+r3] movddup xm%5, [r0+r4] movddup xm%6, [r2+r5] vinserti128 m%2, m%2, xm%5, 1 vinserti128 m%4, m%4, xm%6, 1 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 lea r0, [r0+2*r1] lea r2, [r2+2*r3] movddup xm%3, [r0] movddup xm%5, [r0+4*r1] vinserti128 m%3, m%3, xm%5, 1 movddup xm%5, [r2] movddup xm%4, [r2+4*r3] vinserti128 m%5, m%5, xm%4, 1 movddup xm%4, [r0+r1] movddup xm%6, [r0+r4] vinserti128 m%4, m%4, xm%6, 1 movq xm%6, [r2+r3] movhps xm%6, [r2+r5] vpermq m%6, m%6, q1100 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 %endmacro %macro SATD_START_AVX2 2-3 0 FIX_STRIDES r1, r3 %if %3 mova %2, [hmul_8p] lea r4, [5*r1] lea r5, [5*r3] %else mova %2, [hmul_16p] lea r4, [3*r1] lea r5, [3*r3] %endif pxor %1, %1 %endmacro %define TRANS TRANS_SSE4 INIT_YMM avx2 cglobal pixel_sa8d_8x8_internal LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 HADAMARD4_V 0, 1, 2, 3, 4 HADAMARD 8, sumsub, 0, 1, 4, 5 HADAMARD 8, sumsub, 2, 3, 4, 5 HADAMARD 2, sumsub, 0, 1, 4, 5 HADAMARD 2, sumsub, 2, 3, 4, 5 HADAMARD 1, amax, 0, 1, 4, 5 HADAMARD 1, amax, 2, 3, 4, 5 paddw m6, m0 paddw m6, m2 ret cglobal pixel_sa8d_8x8, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_sa8d_8x8_internal vextracti128 xm1, m6, 1 paddw xm6, xm1 HADDW xm6, xm1 movd eax, xm6 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x16, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_sa8d_8x8_internal ; pix[0] sub r0, r1 sub r0, r1 add r0, 8*SIZEOF_PIXEL sub r2, r3 sub r2, r3 add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal ; pix[8] add r0, r4 add r0, r1 add r2, r5 add r2, r3 call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r0, r1 sub r0, r1 sub r0, 8*SIZEOF_PIXEL sub r2, r3 sub r2, r3 sub r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal ; pix[8*stride] ; TODO: analyze Dynamic Range vextracti128 xm0, m6, 1 paddusw xm6, xm0 HADDUW xm6, xm0 movd eax, xm6 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x16_internal call pixel_sa8d_8x8_internal ; pix[0] sub r0, r1 sub r0, r1 add r0, 8*SIZEOF_PIXEL sub r2, r3 sub r2, r3 add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal ; pix[8] add r0, r4 add r0, r1 add r2, r5 add r2, r3 call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r0, r1 sub r0, r1 sub r0, 8*SIZEOF_PIXEL sub r2, r3 sub r2, r3 sub r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal ; pix[8*stride] ; TODO: analyze Dynamic Range vextracti128 xm0, m6, 1 paddusw xm6, xm0 HADDUW xm6, xm0 movd eax, xm6 add eax, 1 shr eax, 1 ret %if ARCH_X86_64 cglobal pixel_sa8d_32x32, 4,8,8 ; TODO: R6 is RAX on x64 platform, so we use it directly SATD_START_AVX2 m6, m7, 1 xor r7d, r7d call pixel_sa8d_16x16_internal ; [0] pxor m6, m6 add r7d, eax add r0, r4 add r0, r1 add r2, r5 add r2, r3 call pixel_sa8d_16x16_internal ; [2] pxor m6, m6 add r7d, eax lea eax, [r4 * 5 - 16] sub r0, rax sub r0, r1 lea eax, [r5 * 5 - 16] sub r2, rax sub r2, r3 call pixel_sa8d_16x16_internal ; [1] pxor m6, m6 add r7d, eax add r0, r4 add r0, r1 add r2, r5 add r2, r3 call pixel_sa8d_16x16_internal ; [3] add eax, r7d RET %endif ; ARCH_X86_64=1 %endif ; HIGH_BIT_DEPTH ; Input 10bit, Output 8bit 
;------------------------------------------------------------------------------------------------------------------------ ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;------------------------------------------------------------------------------------------------------------------------ INIT_XMM sse2 cglobal downShift_16, 4,7,3 mov r4d, r4m mov r5d, r5m movd m0, r6m ; m0 = shift add r1, r1 dec r5d .loopH: xor r6, r6 .loopW: movu m1, [r0 + r6 * 2] movu m2, [r0 + r6 * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswb m1, m2 movu [r2 + r6], m1 add r6, mmsize cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jnz .loopH ;processing last row of every frame [To handle width which not a multiple of 16] ; r4d must be more than or equal to 16(mmsize) .loop16: movu m1, [r0 + (r4 - mmsize) * 2] movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswb m1, m2 movu [r2 + r4 - mmsize], m1 sub r4d, mmsize jz .end cmp r4d, mmsize jge .loop16 ; process partial pixels movu m1, [r0] movu m2, [r0 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswb m1, m2 movu [r2], m1 .end: RET ; Input 10bit, Output 8bit ;------------------------------------------------------------------------------------------------------------------------------------- ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;------------------------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal downShift_16, 4,7,3 mov r4d, r4m mov r5d, r5m movd xm0, r6m ; m0 = shift add r1d, r1d dec r5d .loopH: xor r6, r6 .loopW: movu m1, [r0 + r6 * 2 + 0] movu m2, [r0 + r6 * 2 + 32] vpsrlw m1, xm0 vpsrlw m2, xm0 packuswb m1, m2 vpermq m1, m1, 11011000b movu [r2 + r6], m1 add r6d, mmsize cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jnz .loopH ; processing last row of every frame [To handle width which not a multiple of 32] .loop32: movu m1, [r0 + (r4 - mmsize) * 2] movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, xm0 psrlw m2, xm0 packuswb m1, m2 vpermq m1, m1, q3120 movu [r2 + r4 - mmsize], m1 sub r4d, mmsize jz .end cmp r4d, mmsize jge .loop32 ; process partial pixels movu m1, [r0] movu m2, [r0 + mmsize] psrlw m1, xm0 psrlw m2, xm0 packuswb m1, m2 vpermq m1, m1, q3120 movu [r2], m1 .end: RET ; Input 8bit, Output 10bit ;--------------------------------------------------------------------------------------------------------------------- ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) ;--------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal upShift_8, 6,7,3 movd xm2, r6m add r3d, r3d dec r5d .loopH: xor r6, r6 .loopW: pmovzxbw m0,[r0 + r6] pmovzxbw m1,[r0 + r6 + mmsize/2] psllw m0, m2 psllw m1, m2 movu [r2 + r6 * 2], m0 movu [r2 + r6 * 2 + mmsize], m1 add r6d, mmsize cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jg .loopH ; processing last row of every frame [To handle width which not a multiple of 16] mov r1d, (mmsize/2 - 1) and r1d, r4d sub r1, mmsize/2 ; NOTE: Width MUST BE more than or equal to 8 shr r4d, 3 ; log2(mmsize) .loopW8: pmovzxbw m0,[r0] psllw m0, m2 movu [r2], m0 add r0, mmsize/2 add r2, mmsize dec r4d jg .loopW8 ; Mac OS X can't read beyond array bound, 
so rollback some bytes pmovzxbw m0,[r0 + r1] psllw m0, m2 movu [r2 + r1 * 2], m0 RET ;--------------------------------------------------------------------------------------------------------------------- ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) ;--------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 INIT_YMM avx2 cglobal upShift_8, 6,7,3 movd xm2, r6m add r3d, r3d dec r5d .loopH: xor r6, r6 .loopW: pmovzxbw m0,[r0 + r6] pmovzxbw m1,[r0 + r6 + mmsize/2] psllw m0, xm2 psllw m1, xm2 movu [r2 + r6 * 2], m0 movu [r2 + r6 * 2 + mmsize], m1 add r6d, mmsize cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jg .loopH ; processing last row of every frame [To handle width which not a multiple of 32] mov r1d, (mmsize/2 - 1) and r1d, r4d sub r1, mmsize/2 ; NOTE: Width MUST BE more than or equal to 16 shr r4d, 4 ; log2(mmsize) .loopW16: pmovzxbw m0,[r0] psllw m0, xm2 movu [r2], m0 add r0, mmsize/2 add r2, mmsize dec r4d jg .loopW16 ; Mac OS X can't read beyond array bound, so rollback some bytes pmovzxbw m0,[r0 + r1] psllw m0, xm2 movu [r2 + r1 * 2], m0 RET %endif %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp %if cpuflag(ssse3) pabsd %1, %3 pabsd %2, %4 %elifidn %1, %3 pxor %5, %5 pxor %6, %6 psubd %5, %1 psubd %6, %2 pmaxsd %1, %5 pmaxsd %2, %6 %else pxor %1, %1 pxor %2, %2 psubd %1, %3 psubd %2, %4 pmaxsd %1, %3 pmaxsd %2, %4 %endif %endmacro ; Input 10bit, Output 12bit ;------------------------------------------------------------------------------------------------------------------------ ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;------------------------------------------------------------------------------------------------------------------------ INIT_XMM sse2 cglobal upShift_16, 4,7,4 mov r4d, r4m mov r5d, r5m movd m0, r6m ; m0 = shift mova m3, [pw_pixel_max] FIX_STRIDES r1d, r3d dec r5d .loopH: xor r6d, r6d .loopW: movu m1, [r0 + r6 * SIZEOF_PIXEL] movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize] psllw m1, m0 psllw m2, m0 ; TODO: if input always valid, we can remove below 2 instructions. 
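; clamp the left-shifted samples to pw_pixel_max so out-of-range input cannot
; overflow the target bit depth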
pand m1, m3 pand m2, m3 movu [r2 + r6 * SIZEOF_PIXEL], m1 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2 add r6, mmsize * 2 / SIZEOF_PIXEL cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jnz .loopH ;processing last row of every frame [To handle width which not a multiple of 16] ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here .loop16: movu m1, [r0 + (r4 - mmsize) * 2] movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] psllw m1, m0 psllw m2, m0 pand m1, m3 pand m2, m3 movu [r2 + (r4 - mmsize) * 2], m1 movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 sub r4d, mmsize jz .end cmp r4d, mmsize jge .loop16 ; process partial pixels movu m1, [r0] movu m2, [r0 + mmsize] psllw m1, m0 psllw m2, m0 pand m1, m3 pand m2, m3 movu [r2], m1 movu [r2 + mmsize], m2 .end: RET ; Input 10bit, Output 12bit ;------------------------------------------------------------------------------------------------------------------------------------- ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;------------------------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal upShift_16, 4,7,4 mov r4d, r4m mov r5d, r5m movd xm0, r6m ; m0 = shift vbroadcasti128 m3, [pw_pixel_max] FIX_STRIDES r1d, r3d dec r5d .loopH: xor r6d, r6d .loopW: movu m1, [r0 + r6 * SIZEOF_PIXEL] movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize] psllw m1, xm0 psllw m2, xm0 pand m1, m3 pand m2, m3 movu [r2 + r6 * SIZEOF_PIXEL], m1 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2 add r6, mmsize * 2 / SIZEOF_PIXEL cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jnz .loopH ; processing last row of every frame [To handle width which not a multiple of 32] .loop32: movu m1, [r0 + (r4 - mmsize) * 2] movu m2, [r0 + (r4 - mmsize) * 2 + mmsize] psllw m1, xm0 psllw m2, xm0 pand m1, m3 pand m2, m3 movu [r2 + (r4 - mmsize) * 2], m1 movu [r2 + (r4 - mmsize) * 2 + mmsize], m2 sub r4d, mmsize jz .end cmp r4d, mmsize jge .loop32 ; process partial pixels movu m1, [r0] movu m2, [r0 + mmsize] psllw m1, xm0 psllw m2, xm0 pand m1, m3 pand m2, m3 movu [r2], m1 movu [r2 + mmsize], m2 .end: RET ;--------------------------------------------------------------------------------------------------------------------- ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) ;--------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal psyCost_pp_4x4, 4, 5, 8 %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m5, m2 paddd m5, m3 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 SUMSUB_BA d, 0, 1, 4 SUMSUB_BA d, 2, 3, 4 SUMSUB_BA d, 0, 2, 4 SUMSUB_BA d, 1, 3, 4 %define ORDER unord TRANS q, ORDER, 0, 2, 4, 6 TRANS q, ORDER, 1, 3, 4, 6 ABSD2 m0, m2, m0, m2, m4, m6 pmaxsd m0, m2 ABSD2 m1, m3, m1, m3, m4, m6 pmaxsd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m7, m0, m5 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m5, m2 paddd m5, m3 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 SUMSUB_BA d, 0, 1, 4 
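; the SUMSUB_BA stages here and below are sum/difference butterflies on dword
; lanes, essentially the vertical part of the 4x4 Hadamard transform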
SUMSUB_BA d, 2, 3, 4 SUMSUB_BA d, 0, 2, 4 SUMSUB_BA d, 1, 3, 4 %define ORDER unord TRANS q, ORDER, 0, 2, 4, 6 TRANS q, ORDER, 1, 3, 4, 6 ABSD2 m0, m2, m0, m2, m4, m6 pmaxsd m0, m2 ABSD2 m1, m3, m1, m3, m4, m6 pmaxsd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m0, m5 psubd m7, m0 pabsd m0, m7 movd eax, m0 %else ; !HIGH_BIT_DEPTH lea r4, [3 * r1] movd m0, [r0] movd m1, [r0 + r1] movd m2, [r0 + r1 * 2] movd m3, [r0 + r4] shufps m0, m1, 0 shufps m2, m3, 0 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 movhlps m4, m5 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 HADAMARD 0, sumsub, 0, 2, 1, 3 HADAMARD 4, sumsub, 0, 2, 1, 3 HADAMARD 1, amax, 0, 2, 1, 3 HADDW m0, m2 psubd m6, m0, m5 lea r4, [3 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + r3 * 2] movd m3, [r2 + r4] shufps m0, m1, 0 shufps m2, m3, 0 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 movhlps m4, m5 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 HADAMARD 0, sumsub, 0, 2, 1, 3 HADAMARD 4, sumsub, 0, 2, 1, 3 HADAMARD 1, amax, 0, 2, 1, 3 HADDW m0, m2 psubd m0, m5 psubd m6, m0 pabsd m0, m6 movd eax, m0 %endif ; HIGH_BIT_DEPTH RET %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_pp_8x8, 4, 6, 13 %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3 lea r4, [3 * r1] pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m10, m0, m8 lea r4, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 movd eax, m0 %else ; !HIGH_BIT_DEPTH lea r4, [3 * r1] mova m8, [hmul_8p] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m12, m0, m11 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] 
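; horizontal reduction of the accumulated row sums, then >>2: this forms the
; block "sad" (DC) term that is subtracted from the SA8D result below, so that
; roughly only the AC energy remains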
psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 movd eax, m0 %endif ; HIGH_BIT_DEPTH RET %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_16x16, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov r7d, 2 .loopH: mov r6d, 2 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_16x16, 4, 9, 15 lea r4, [3 * r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 2 .loopH: mov r6d, 2 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 16] lea r2, [r2 + r3 * 8 - 16] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_32x32, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov 
r7d, 4 .loopH: mov r6d, 4 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_32x32, 4, 9, 15 lea r4, [3 * r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 4 .loopH: mov r6d, 4 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_64x64, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov r7d, 8 .loopH: mov r6d, 8 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd 
m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_64x64, 4, 9, 15 lea r4, [3 * r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 8 .loopH: mov r6d, 8 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif INIT_YMM avx2 %if HIGH_BIT_DEPTH cglobal psyCost_pp_4x4, 4, 5, 6 add r1d, r1d add r3d, r3d lea r4, [r1 * 3] movddup xm0, [r0] movddup xm1, [r0 + r1] movddup xm2, [r0 + r1 * 2] movddup xm3, [r0 + r4] lea r4, [r3 * 3] movddup xm4, [r2] movddup xm5, [r2 + r3] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 movddup xm4, [r2 + r3 * 2] movddup xm5, [r2 + r4] vinserti128 m2, m2, xm4, 1 vinserti128 m3, m3, xm5, 1 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m4, m2, m3 paddd m5, m4 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 movaps m4, m0 vshufps m4, m4, m2, 11011101b vshufps m0, m0, m2, 10001000b movaps m2, m1 vshufps m2, m2, m3, 11011101b vshufps m1, m1, m3, 10001000b pabsd m0, m0 pabsd m4, m4 pmaxsd m0, m4 pabsd m1, m1 pabsd m2, m2 pmaxsd m1, m2 paddd m0, m1 vpermq m1, m0, 11110101b paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m0, m5 vextracti128 xm1, m0, 1 psubd xm1, 
xm0 pabsd xm1, xm1 movd eax, xm1 RET %else ; !HIGH_BIT_DEPTH cglobal psyCost_pp_4x4, 4, 5, 6 lea r4, [3 * r1] movd xm0, [r0] movd xm1, [r0 + r1] movd xm2, [r0 + r1 * 2] movd xm3, [r0 + r4] vshufps xm0, xm1, 0 vshufps xm2, xm3, 0 lea r4, [3 * r3] movd xm1, [r2] movd xm3, [r2 + r3] movd xm4, [r2 + r3 * 2] movd xm5, [r2 + r4] vshufps xm1, xm3, 0 vshufps xm4, xm5, 0 vinserti128 m0, m0, xm1, 1 vinserti128 m2, m2, xm4, 1 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 mova m1, m5 psrldq m4, m5, 8 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 vpsubw m2, m2, m0 vpunpckhqdq m0, m1, m2 vpunpcklqdq m1, m1, m2 vpaddw m2, m1, m0 vpsubw m0, m0, m1 vpblendw m1, m2, m0, 10101010b vpslld m0, m0, 10h vpsrld m2, m2, 10h vpor m0, m0, m2 vpabsw m1, m1 vpabsw m0, m0 vpmaxsw m1, m1, m0 vpmaddwd m1, m1, [pw_1] psrldq m2, m1, 8 paddd m1, m2 psrldq m3, m1, 4 paddd m1, m3 psubd m1, m5 vextracti128 xm2, m1, 1 psubd m1, m2 pabsd m1, m1 movd eax, xm1 RET %endif %macro PSY_PP_8x8 0 movddup m0, [r0 + r1 * 0] movddup m1, [r0 + r1 * 1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4 * 1] lea r5, [r0 + r1 * 4] movddup m4, [r2 + r3 * 0] movddup m5, [r2 + r3 * 1] movddup m6, [r2 + r3 * 2] movddup m7, [r2 + r7 * 1] lea r6, [r2 + r3 * 4] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 movddup m4, [r5 + r1 * 0] movddup m5, [r5 + r1 * 1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4 * 1] movddup m9, [r6 + r3 * 0] movddup m10, [r6 + r3 * 1] movddup m11, [r6 + r3 * 2] movddup m12, [r6 + r7 * 1] vinserti128 m4, m4, xm9, 1 vinserti128 m5, m5, xm10, 1 vinserti128 m6, m6, xm11, 1 vinserti128 m7, m7, xm12, 1 pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 mova m9, m0 paddw m0, m1 ; m0+m1 psubw m1, m9 ; m1-m0 mova m9, m2 paddw m2, m3 ; m2+m3 psubw m3, m9 ; m3-m2 mova m9, m0 paddw m0, m2 ; m0+m1+m2+m3 psubw m2, m9 ; m2+m3-m0+m1 mova m9, m1 paddw m1, m3 ; m1-m0+m3-m2 psubw m3, m9 ; m3-m2-m1-m0 movdqa m9, m4 paddw m4, m5 ; m4+m5 psubw m5, m9 ; m5-m4 movdqa m9, m6 paddw m6, m7 ; m6+m7 psubw m7, m9 ; m7-m6 movdqa m9, m4 paddw m4, m6 ; m4+m5+m6+m7 psubw m6, m9 ; m6+m7-m4+m5 movdqa m9, m5 paddw m5, m7 ; m5-m4+m7-m6 psubw m7, m9 ; m7-m6-m5-m4 movdqa m9, m0 paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7) psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3) movdqa m9, m1 paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6) psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2) mova m9, m0 vshufps m9, m9, m4, 11011101b vshufps m0, m0, m4, 10001000b movdqa m4, m0 paddw m0, m9 ; (a0 + a4) + (a4 - a0) psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4) movaps m4, m1 vshufps m4, m4, m5, 11011101b vshufps m1, m1, m5, 10001000b movdqa m5, m1 paddw m1, m4 psubw m4, m5 movdqa m5, m2 paddw m2, m6 psubw m6, m5 movdqa m5, m3 paddw m3, m7 psubw m7, m5 movaps m5, m2 vshufps m5, m5, m6, 11011101b vshufps m2, m2, m6, 10001000b movdqa m6, m2 paddw m2, m5 psubw m5, m6 movaps m6, m3 vshufps m6, m6, m7, 11011101b vshufps m3, m3, m7, 10001000b movdqa m7, m3 paddw m3, m6 psubw m6, m7 movdqa m7, m0 pblendw m0, m9, 10101010b pslld m9, 10h psrld m7, 10h por m9, m7 pabsw m0, m0 pabsw m9, m9 pmaxsw m0, m9 movdqa m7, m1 pblendw m1, m4, 10101010b pslld m4, 10h psrld m7, 10h por m4, m7 pabsw m1, m1 pabsw m4, m4 pmaxsw m1, m4 movdqa m7, m2 pblendw m2, m5, 10101010b pslld 
m5, 10h psrld m7, 10h por m5, m7 pabsw m2, m2 pabsw m5, m5 pmaxsw m2, m5 mova m7, m3 pblendw m3, m6, 10101010b pslld m6, 10h psrld m7, 10h por m6, m7 pabsw m3, m3 pabsw m6, m6 pmaxsw m3, m6 paddw m0, m1 paddw m0, m2 paddw m0, m3 pmaddwd m0, [pw_1] psrldq m1, m0, 8 paddd m0, m1 pshuflw m1, m0, 00001110b paddd m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m11 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 %endmacro %macro PSY_PP_8x8_AVX2 0 lea r4, [r1 * 3] movu xm0, [r0] movu xm1, [r0 + r1] movu xm2, [r0 + r1 * 2] movu xm3, [r0 + r4] lea r5, [r0 + r1 * 4] movu xm4, [r5] movu xm5, [r5 + r1] movu xm6, [r5 + r1 * 2] movu xm7, [r5 + r4] lea r4, [r3 * 3] vinserti128 m0, m0, [r2], 1 vinserti128 m1, m1, [r2 + r3], 1 vinserti128 m2, m2, [r2 + r3 * 2], 1 vinserti128 m3, m3, [r2 + r4], 1 lea r5, [r2 + r3 * 4] vinserti128 m4, m4, [r5], 1 vinserti128 m5, m5, [r5 + r3], 1 vinserti128 m6, m6, [r5 + r3 * 2], 1 vinserti128 m7, m7, [r5 + r4], 1 paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] psrldq m9, m8, 8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 psubw m9, m1, m0 paddw m0, m1 psubw m1, m3, m2 paddw m2, m3 punpckhwd m3, m0, m9 punpcklwd m0, m9 psubw m9, m3, m0 paddw m0, m3 punpckhwd m3, m2, m1 punpcklwd m2, m1 psubw m10, m3, m2 paddw m2, m3 psubw m3, m5, m4 paddw m4, m5 psubw m5, m7, m6 paddw m6, m7 punpckhwd m1, m4, m3 punpcklwd m4, m3 psubw m7, m1, m4 paddw m4, m1 punpckhwd m3, m6, m5 punpcklwd m6, m5 psubw m1, m3, m6 paddw m6, m3 psubw m3, m2, m0 paddw m0, m2 psubw m2, m10, m9 paddw m9, m10 punpckhdq m5, m0, m3 punpckldq m0, m3 psubw m10, m5, m0 paddw m0, m5 punpckhdq m3, m9, m2 punpckldq m9, m2 psubw m5, m3, m9 paddw m9, m3 psubw m3, m6, m4 paddw m4, m6 psubw m6, m1, m7 paddw m7, m1 punpckhdq m2, m4, m3 punpckldq m4, m3 psubw m1, m2, m4 paddw m4, m2 punpckhdq m3, m7, m6 punpckldq m7, m6 psubw m2, m3, m7 paddw m7, m3 psubw m3, m4, m0 paddw m0, m4 psubw m4, m1, m10 paddw m10, m1 punpckhqdq m6, m0, m3 punpcklqdq m0, m3 pabsw m0, m0 pabsw m6, m6 pmaxsw m0, m6 punpckhqdq m3, m10, m4 punpcklqdq m10, m4 pabsw m10, m10 pabsw m3, m3 pmaxsw m10, m3 psubw m3, m7, m9 paddw m9, m7 psubw m7, m2, m5 paddw m5, m2 punpckhqdq m4, m9, m3 punpcklqdq m9, m3 pabsw m9, m9 pabsw m4, m4 pmaxsw m9, m4 punpckhqdq m3, m5, m7 punpcklqdq m5, m7 pabsw m5, m5 pabsw m3, m3 pmaxsw m5, m3 paddd m0, m9 paddd m0, m10 paddd m0, m5 psrld m9, m0, 16 pslld m0, 16 psrld m0, 16 paddd m0, m9 psrldq m9, m0, 8 paddd m0, m9 psrldq m9, m0, 4 paddd m0, m9 paddd m0, [pd_1] psrld m0, 1 psubd m0, m8 vextracti128 xm1, m0, 1 psubd xm1, xm0 pabsd xm1, xm1 %endmacro %macro PSY_COST_PP_8x8_MAIN12 0 ; load source pixels lea r4, [r1 * 3] pmovzxwd m0, [r0] pmovzxwd m1, [r0 + r1] pmovzxwd m2, [r0 + r1 * 2] pmovzxwd m3, [r0 + r4] lea r5, [r0 + r1 * 4] pmovzxwd m4, [r5] pmovzxwd m5, [r5 + r1] pmovzxwd m6, [r5 + r1 * 2] pmovzxwd m7, [r5 + r4] ; source SAD paddd m8, m0, m1 paddd m8, m2 paddd m8, m3 paddd m8, m4 paddd m8, m5 paddd m8, m6 paddd m8, m7 vextracti128 xm9, m8, 1 paddd m8, m9 ; sad_8x8 movhlps xm9, xm8 paddd xm8, xm9 pshuflw xm9, xm8, 0Eh paddd xm8, xm9 psrld m8, 2 ; source SA8D psubd m9, m1, m0 paddd m0, m1 psubd m1, m3, m2 paddd m2, m3 punpckhdq m3, m0, m9 punpckldq m0, m9 psubd m9, m3, m0 paddd m0, m3 punpckhdq m3, m2, m1 punpckldq m2, m1 psubd m10, m3, m2 paddd m2, m3 psubd m3, m5, m4 paddd m4, m5 psubd m5, m7, m6 paddd m6, m7 punpckhdq m1, m4, m3 punpckldq m4, m3 psubd m7, m1, m4 paddd m4, m1 punpckhdq m3, m6, m5 punpckldq m6, m5 psubd m1, m3, m6 paddd m6, m3 psubd m3, 
m2, m0 paddd m0, m2 psubd m2, m10, m9 paddd m9, m10 punpckhqdq m5, m0, m3 punpcklqdq m0, m3 psubd m10, m5, m0 paddd m0, m5 punpckhqdq m3, m9, m2 punpcklqdq m9, m2 psubd m5, m3, m9 paddd m9, m3 psubd m3, m6, m4 paddd m4, m6 psubd m6, m1, m7 paddd m7, m1 punpckhqdq m2, m4, m3 punpcklqdq m4, m3 psubd m1, m2, m4 paddd m4, m2 punpckhqdq m3, m7, m6 punpcklqdq m7, m6 psubd m2, m3, m7 paddd m7, m3 psubd m3, m4, m0 paddd m0, m4 psubd m4, m1, m10 paddd m10, m1 vinserti128 m6, m0, xm3, 1 vperm2i128 m0, m0, m3, 00110001b pabsd m0, m0 pabsd m6, m6 pmaxsd m0, m6 vinserti128 m3, m10, xm4, 1 vperm2i128 m10, m10, m4, 00110001b pabsd m10, m10 pabsd m3, m3 pmaxsd m10, m3 psubd m3, m7, m9 paddd m9, m7 psubd m7, m2, m5 paddd m5, m2 vinserti128 m4, m9, xm3, 1 vperm2i128 m9, m9, m3, 00110001b pabsd m9, m9 pabsd m4, m4 pmaxsd m9, m4 vinserti128 m3, m5, xm7, 1 vperm2i128 m5, m5, m7, 00110001b pabsd m5, m5 pabsd m3, m3 pmaxsd m5, m3 paddd m0, m9 paddd m0, m10 paddd m0, m5 vextracti128 xm9, m0, 1 paddd m0, m9 ; sad_8x8 movhlps xm9, xm0 paddd xm0, xm9 pshuflw xm9, xm0, 0Eh paddd xm0, xm9 paddd m0, [pd_1] psrld m0, 1 ; sa8d_8x8 psubd m11, m0, m8 ; sa8d_8x8 - sad_8x8 ; load recon pixels lea r4, [r3 * 3] pmovzxwd m0, [r2] pmovzxwd m1, [r2 + r3] pmovzxwd m2, [r2 + r3 * 2] pmovzxwd m3, [r2 + r4] lea r5, [r2 + r3 * 4] pmovzxwd m4, [r5] pmovzxwd m5, [r5 + r3] pmovzxwd m6, [r5 + r3 * 2] pmovzxwd m7, [r5 + r4] ; recon SAD paddd m8, m0, m1 paddd m8, m2 paddd m8, m3 paddd m8, m4 paddd m8, m5 paddd m8, m6 paddd m8, m7 vextracti128 xm9, m8, 1 paddd m8, m9 ; sad_8x8 movhlps xm9, xm8 paddd xm8, xm9 pshuflw xm9, xm8, 0Eh paddd xm8, xm9 psrld m8, 2 ; recon SA8D psubd m9, m1, m0 paddd m0, m1 psubd m1, m3, m2 paddd m2, m3 punpckhdq m3, m0, m9 punpckldq m0, m9 psubd m9, m3, m0 paddd m0, m3 punpckhdq m3, m2, m1 punpckldq m2, m1 psubd m10, m3, m2 paddd m2, m3 psubd m3, m5, m4 paddd m4, m5 psubd m5, m7, m6 paddd m6, m7 punpckhdq m1, m4, m3 punpckldq m4, m3 psubd m7, m1, m4 paddd m4, m1 punpckhdq m3, m6, m5 punpckldq m6, m5 psubd m1, m3, m6 paddd m6, m3 psubd m3, m2, m0 paddd m0, m2 psubd m2, m10, m9 paddd m9, m10 punpckhqdq m5, m0, m3 punpcklqdq m0, m3 psubd m10, m5, m0 paddd m0, m5 punpckhqdq m3, m9, m2 punpcklqdq m9, m2 psubd m5, m3, m9 paddd m9, m3 psubd m3, m6, m4 paddd m4, m6 psubd m6, m1, m7 paddd m7, m1 punpckhqdq m2, m4, m3 punpcklqdq m4, m3 psubd m1, m2, m4 paddd m4, m2 punpckhqdq m3, m7, m6 punpcklqdq m7, m6 psubd m2, m3, m7 paddd m7, m3 psubd m3, m4, m0 paddd m0, m4 psubd m4, m1, m10 paddd m10, m1 vinserti128 m6, m0, xm3, 1 vperm2i128 m0, m0, m3, 00110001b pabsd m0, m0 pabsd m6, m6 pmaxsd m0, m6 vinserti128 m3, m10, xm4, 1 vperm2i128 m10, m10, m4, 00110001b pabsd m10, m10 pabsd m3, m3 pmaxsd m10, m3 psubd m3, m7, m9 paddd m9, m7 psubd m7, m2, m5 paddd m5, m2 vinserti128 m4, m9, xm3, 1 vperm2i128 m9, m9, m3, 00110001b pabsd m9, m9 pabsd m4, m4 pmaxsd m9, m4 vinserti128 m3, m5, xm7, 1 vperm2i128 m5, m5, m7, 00110001b pabsd m5, m5 pabsd m3, m3 pmaxsd m5, m3 paddd m0, m9 paddd m0, m10 paddd m0, m5 vextracti128 xm9, m0, 1 paddd m0, m9 ; sad_8x8 movhlps xm9, xm0 paddd xm0, xm9 pshuflw xm9, xm0, 0Eh paddd xm0, xm9 paddd m0, [pd_1] psrld m0, 1 ; sa8d_8x8 psubd m0, m8 ; sa8d_8x8 - sad_8x8 psubd m11, m0 pabsd m11, m11 %endmacro %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12 cglobal psyCost_pp_8x8, 4, 8, 12 add r1d, r1d add r3d, r3d PSY_COST_PP_8x8_MAIN12 movd eax, xm11 RET %endif %if HIGH_BIT_DEPTH && BIT_DEPTH == 10 cglobal psyCost_pp_8x8, 4, 8, 11 add r1d, r1d add r3d, r3d PSY_PP_8x8_AVX2 movd eax, xm1 RET %endif 
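; All three bit-depth branches of psyCost_pp_8x8 compute the same per-block
; quantity, roughly |(sa8d - sum/4) of the source - (sa8d - sum/4) of the
; recon|, i.e. the difference in AC (psycho-visual) energy; only the
; load/multiply helpers differ (PSY_COST_PP_8x8_MAIN12, PSY_PP_8x8_AVX2,
; PSY_PP_8x8).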
%if BIT_DEPTH == 8 cglobal psyCost_pp_8x8, 4, 8, 13 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] PSY_PP_8x8 movd eax, xm0 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12 cglobal psyCost_pp_16x16, 4, 10, 13 add r1d, r1d add r3d, r3d pxor m12, m12 mov r8d, 2 .loopH: mov r9d, 2 .loopW: PSY_COST_PP_8x8_MAIN12 paddd xm12, xm11 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r8d jnz .loopH movd eax, xm12 RET %endif %if HIGH_BIT_DEPTH && BIT_DEPTH == 10 cglobal psyCost_pp_16x16, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 2 .loopH: mov r9d, 2 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r8d jnz .loopH movd eax, xm11 RET %endif %if BIT_DEPTH == 8 cglobal psyCost_pp_16x16, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 2 .loopH: mov r9d, 2 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 16] lea r2, [r2 + r3 * 8 - 16] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12 cglobal psyCost_pp_32x32, 4, 10, 13 add r1d, r1d add r3d, r3d pxor m12, m12 mov r8d, 4 .loopH: mov r9d, 4 .loopW: PSY_COST_PP_8x8_MAIN12 paddd xm12, xm11 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r8d jnz .loopH movd eax, xm12 RET %endif %if HIGH_BIT_DEPTH && BIT_DEPTH == 10 cglobal psyCost_pp_32x32, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 4 .loopH: mov r9d, 4 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r8d jnz .loopH movd eax, xm11 RET %endif %if BIT_DEPTH == 8 cglobal psyCost_pp_32x32, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 4 .loopH: mov r9d, 4 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12 cglobal psyCost_pp_64x64, 4, 10, 13 add r1d, r1d add r3d, r3d pxor m12, m12 mov r8d, 8 .loopH: mov r9d, 8 .loopW: PSY_COST_PP_8x8_MAIN12 paddd xm12, xm11 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r8d jnz .loopH movd eax, xm12 RET %endif %if HIGH_BIT_DEPTH && BIT_DEPTH == 10 cglobal psyCost_pp_64x64, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 8 .loopH: mov r9d, 8 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r8d jnz .loopH movd eax, xm11 RET %endif %if BIT_DEPTH == 8 cglobal psyCost_pp_64x64, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 8 .loopH: mov r9d, 8 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif ;--------------------------------------------------------------------------------------------------------------------- ;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) ;--------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 
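; The _ss variants apply the same psy-energy measure to int16_t residual
; blocks: strides are doubled on entry (add r1, r1 / add r3, r3) to convert
; element strides to byte strides, the sample-sum term uses pabsw because
; residuals are signed, and the hmul_8w / pw_pmpmpmpm constants supply the
; +/-1 multipliers for the horizontal Hadamard stage.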
cglobal psyCost_ss_4x4, 4, 5, 8 add r1, r1 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 paddd m1, m4 shufps m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m7, m0, m6 add r3, r3 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 paddd m1, m4 shufps m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m0, m6 psubd m7, m0 pabsd m0, m7 movd eax, m0 RET %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_8x8, 4, 6, 15 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m12, m7 paddw m11, m12 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 
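; the psrldq/psubd/paddd/shufps group (applied twice per register) is a
; butterfly over adjacent dword lanes: sums land in the low half and
; differences in the high half of the shuffled result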
psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m12, m0, m8 lea r4, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 
10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, 
m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 psubd m12, m0 pabsd m0, m12 movd eax, m0 RET %endif %macro psy_cost_ss 0 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m12, m7 paddw m11, m12 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b 
psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m12, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r6] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r6] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r6] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 
10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r6] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 psubd m12, m0 pabsd m0, m12 paddd m15, m0 %endmacro %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_16x16, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 2 .loopH: mov r8d, 2 .loopW: psy_cost_ss add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m15 RET %endif %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_32x32, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 4 .loopH: mov r8d, 4 .loopW: psy_cost_ss add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m15 RET %endif %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_64x64, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 8 .loopH: mov r8d, 8 .loopW: psy_cost_ss add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, m15 RET %endif INIT_YMM avx2 cglobal psyCost_ss_4x4, 4, 5, 8 add r1, r1 add r3, r3 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r4, [3 * r3] movddup m4, [r2] movddup m5, [r2 + r3] movddup m6, [r2 + r3 * 2] movddup m7, [r2 + r4] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 
paddd m1, m4 shufps m1, m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m0, m6 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 movd eax, xm0 RET %macro PSY_SS_8x8 0 lea r4, [3 * r1] lea r6, [r0 + r1 * 4] movu xm0, [r0] movu xm1, [r0 + r1] movu xm2, [r0 + r1 * 2] movu xm3, [r0 + r4] movu xm4, [r6] movu xm5, [r6 + r1] movu xm6, [r6 + r1 * 2] movu xm7, [r6 + r4] lea r4, [3 * r3] lea r6, [r2 + r3 * 4] movu xm8, [r2] movu xm9, [r2 + r3] movu xm10, [r2 + r3 * 2] movu xm11, [r2 + r4] vinserti128 m0, m0, xm8, 1 vinserti128 m1, m1, xm9, 1 vinserti128 m2, m2, xm10, 1 vinserti128 m3, m3, xm11, 1 movu xm8, [r6] movu xm9, [r6 + r3] movu xm10, [r6 + r3 * 2] movu xm11, [r6 + r4] vinserti128 m4, m4, xm8, 1 vinserti128 m5, m5, xm9, 1 vinserti128 m6, m6, xm10, 1 vinserti128 m7, m7, xm11, 1 ;; store on stack to use later mova [rsp + 0 * mmsize], m0 mova [rsp + 1 * mmsize], m1 mova [rsp + 2 * mmsize], m2 mova [rsp + 3 * mmsize], m3 mova [rsp + 4 * mmsize], m4 mova [rsp + 5 * mmsize], m5 mova [rsp + 6 * mmsize], m6 mova [rsp + 7 * mmsize], m7 pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 psrldq m9, m8, 8 vextracti128 xm10, m8, 1 vextracti128 xm11, m9, 1 vpmovzxwd m8, xm8 vpmovzxwd m9, xm9 vpmovzxwd m10, xm10 vpmovzxwd m11, xm11 vinserti128 m8, m8, xm10, 1 vinserti128 m9, m9, xm11, 1 paddd m8, m9 psrldq m9, m8, 8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 ; sad_4x4 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA 
d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 pmaddwd m0, m12, [rsp + 0 * mmsize] pmaddwd m1, m12, [rsp + 1 * mmsize] pmaddwd m2, m12, [rsp + 2 * mmsize] pmaddwd m3, m12, [rsp + 3 * mmsize] psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m12, [rsp + 4 * mmsize] pmaddwd m5, m12, [rsp + 5 * mmsize] pmaddwd m6, m12, [rsp + 6 * mmsize] pmaddwd m7, m12, [rsp + 7 * mmsize] psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 %endmacro %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_8x8, 4, 7, 14 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 PSY_SS_8x8 movd eax, xm0 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_16x16, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 2 .loopH: mov r8d, 2 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_32x32, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 4 .loopH: mov r8d, 4 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 
dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_64x64, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 8 .loopH: mov r8d, 8 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif ;------------------------------------------------------------------------------------------------------------------------------------- ; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix) ;------------------------------------------------------------------------------------------------------------------------------------- %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal planeClipAndMax, 5,7,8 movd xm0, r5m vpbroadcastb m0, xm0 ; m0 = [min] vpbroadcastb m1, r6m ; m1 = [max] pxor m2, m2 ; m2 = sumLuma pxor m3, m3 ; m3 = maxLumaLevel pxor m4, m4 ; m4 = zero ; get mask to partial register pixels mov r5d, r2d and r2d, ~(mmsize - 1) sub r5d, r2d lea r6, [pb_movemask_32 + mmsize] sub r6, r5 movu m5, [r6] ; m5 = mask for last couple column .loopH: lea r5d, [r2 - mmsize] .loopW: movu m6, [r0 + r5] pmaxub m6, m0 pminub m6, m1 movu [r0 + r5], m6 ; store back pmaxub m3, m6 ; update maxLumaLevel psadbw m6, m4 paddq m2, m6 sub r5d, mmsize jge .loopW ; partial pixels movu m7, [r0 + r2] pmaxub m6, m7, m0 pminub m6, m1 pand m7, m5 ; get invalid/unchange pixel pandn m6, m5, m6 ; clear invalid pixels por m7, m6 ; combin valid & invalid pixels movu [r0 + r2], m7 ; store back pmaxub m3, m6 ; update maxLumaLevel psadbw m6, m4 paddq m2, m6 .next: add r0, r1 dec r3d jg .loopH ; sumLuma vextracti128 xm0, m2, 1 paddq xm0, xm2 movhlps xm1, xm0 paddq xm0, xm1 movq [r4], xm0 ; maxLumaLevel vextracti128 xm0, m3, 1 pmaxub xm0, xm3 movhlps xm3, xm0 pmaxub xm0, xm3 pmovzxbw xm0, xm0 pxor xm0, [pb_movemask + 16] phminposuw xm0, xm0 movd eax, xm0 not al movzx eax, al RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 %if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 %macro LOAD_DIFF_AVX2 4 movu %1, %3 movu %2, %4 psubw %1, %2 %endmacro %macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer LOAD_DIFF_AVX2 xm%1, xm%5, [%7], [%8] LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1], [%8+r3] LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3] LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4], [%8+r5] ;lea %7, [%7+4*r1] ;lea %8, [%8+4*r3] %endmacro INIT_XMM avx2 cglobal pixel_sa8d_8x8_internal lea r6, [r0+4*r1] lea r7, [r2+4*r3] LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax ;HADAMARD2_2D 0, 1, 2, 8, 6, wd ;HADAMARD2_2D 4, 5, 3, 9, 6, wd ;HADAMARD2_2D 0, 2, 1, 8, 6, dq ;HADAMARD2_2D 4, 3, 5, 9, 6, dq ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax paddw m0, m1 paddw m0, m2 paddw m0, m8 SAVE_MM_PERMUTATION ret INIT_XMM avx2 cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal HADDUW m0, m1 movd eax, m0 add eax, 1 shr eax, 1 RET INIT_YMM avx2 cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] lea r6, [r0+4*r1] lea r7, [r2+4*r3] vbroadcasti128 m7, [pw_1] ; Top 16x8 
;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] ; 10 bits movu m5, [r2] psubw m0, m5 ; 11 bits movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax ; 16 bits pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m10, m0, m2 lea r0, [r0+8*r1] lea r2, [r2+8*r3] lea r6, [r6+8*r1] lea r7, [r7+8*r3] ; Bottom 16x8 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m10, m0 paddd m10, m2 HADDD m10, m0 movd eax, xm10 add eax, 1 shr eax, 1 RET ; TODO: optimize me, need more 2 of YMM registers because C model get partial result every 16x16 block INIT_YMM avx2 cglobal pixel_sa8d_32x32, 4,8,14 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] lea r6, [r0+4*r1] lea r7, [r2+4*r3] vbroadcasti128 m7, [pw_1] ;SA8D[16x8] ; pix[0] ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m10, m0, m2 ; SA8D[16x8] ; pix[16] add r0, mmsize add r2, mmsize add r6, mmsize add r7, mmsize ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m12, m0, m2 ; SA8D[16x8] ; pix[8*stride+16] lea r0, [r0+8*r1] lea r2, [r2+8*r3] lea r6, [r6+8*r1] lea r7, [r7+8*r3] ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, 
[r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m12, m0 paddd m12, m2 ; sum[1] HADDD m12, m0 ; SA8D[16x8] ; pix[8*stride] sub r0, mmsize sub r2, mmsize sub r6, mmsize sub r7, mmsize ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m10, m0 paddd m10, m2 ; sum[0] HADDD m10, m0 punpckldq xm10, xm12 ;SA8D[16x8] ; pix[16*stridr] lea r0, [r0+8*r1] lea r2, [r2+8*r3] lea r6, [r6+8*r1] lea r7, [r7+8*r3] ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m12, m0, m2 ; SA8D[16x8] ; pix[16*stride+16] add r0, mmsize add r2, mmsize add r6, mmsize add r7, mmsize ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m13, m0, m2 ; SA8D[16x8] ; pix[24*stride+16] lea r0, [r0+8*r1] lea r2, [r2+8*r3] lea r6, [r6+8*r1] lea r7, [r7+8*r3] ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m13, m0 paddd m13, m2 ; sum[3] HADDD m13, m0 ; SA8D[16x8] ; pix[24*stride] sub r0, mmsize sub r2, mmsize sub r6, mmsize sub r7, mmsize 
;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] psubw m0, m5 movu m1, [r0 + r1] movu m6, [r2 + r3] psubw m1, m6 movu m2, [r0 + r1 * 2] movu m5, [r2 + r3 * 2] psubw m2, m5 movu m8, [r0 + r4] movu m6, [r2 + r5] psubw m8, m6 ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 movu m4, [r6] movu m11, [r7] psubw m4, m11 movu m5, [r6 + r1] movu m6, [r7 + r3] psubw m5, m6 movu m3, [r6 + r1 * 2] movu m11, [r7 + r3 * 2] psubw m3, m11 movu m9, [r6 + r4] movu m6, [r7 + r5] psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax pmaddwd m0, m7 pmaddwd m1, m7 pmaddwd m2, m7 pmaddwd m8, m7 paddd m0, m1 paddd m2, m8 paddd m12, m0 paddd m12, m2 ; sum[2] HADDD m12, m0 punpckldq xm12, xm13 ; SA8D punpcklqdq xm0, xm10, xm12 paddd xm0, [pd_1] psrld xm0, 1 HADDD xm0, xm1 movd eax, xm0 RET %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 xavs2-1.3/source/common/x86/pixel-util.h000066400000000000000000000240651340660520300201100ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Steve Borho ;* Min Chen * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #ifndef XAVS2_PIXEL_UTIL_H #define XAVS2_PIXEL_UTIL_H #define xavs2_getResidual4_sse2 FPFX(getResidual4_sse2) void xavs2_getResidual4_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual8_sse2 FPFX(getResidual8_sse2) void xavs2_getResidual8_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual16_sse2 FPFX(getResidual16_sse2) void xavs2_getResidual16_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual16_sse4 FPFX(getResidual16_sse4) void xavs2_getResidual16_sse4(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual32_sse2 FPFX(getResidual32_sse2) void xavs2_getResidual32_sse2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual32_sse4 FPFX(getResidual32_sse4) void xavs2_getResidual32_sse4(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual16_avx2 FPFX(getResidual16_avx2) void xavs2_getResidual16_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_getResidual32_avx2 FPFX(getResidual32_avx2) void xavs2_getResidual32_avx2(const pel_t *fenc, const pel_t *pred, int16_t *residual, intptr_t stride); #define xavs2_transpose4_sse2 FPFX(transpose4_sse2) void xavs2_transpose4_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose8_sse2 FPFX(transpose8_sse2) void xavs2_transpose8_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose16_sse2 FPFX(transpose16_sse2) void xavs2_transpose16_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose32_sse2 FPFX(transpose32_sse2) void xavs2_transpose32_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose64_sse2 FPFX(transpose64_sse2) void xavs2_transpose64_sse2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose8_avx2 FPFX(transpose8_avx2) void xavs2_transpose8_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose16_avx2 FPFX(transpose16_avx2) void xavs2_transpose16_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose32_avx2 FPFX(transpose32_avx2) void xavs2_transpose32_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_transpose64_avx2 FPFX(transpose64_avx2) void xavs2_transpose64_avx2(pel_t *dst, const pel_t *src, intptr_t stride); #define xavs2_count_nonzero_4x4_ssse3 FPFX(count_nonzero_4x4_ssse3) int xavs2_count_nonzero_4x4_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_8x8_ssse3 FPFX(count_nonzero_8x8_ssse3) int xavs2_count_nonzero_8x8_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_16x16_ssse3 FPFX(count_nonzero_16x16_ssse3) int xavs2_count_nonzero_16x16_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_32x32_ssse3 FPFX(count_nonzero_32x32_ssse3) int xavs2_count_nonzero_32x32_ssse3(const int16_t *quantCoeff); #define xavs2_count_nonzero_4x4_avx2 FPFX(count_nonzero_4x4_avx2) int xavs2_count_nonzero_4x4_avx2(const int16_t *quantCoeff); #define xavs2_count_nonzero_8x8_avx2 FPFX(count_nonzero_8x8_avx2) int xavs2_count_nonzero_8x8_avx2(const int16_t *quantCoeff); #define xavs2_count_nonzero_16x16_avx2 FPFX(count_nonzero_16x16_avx2) int xavs2_count_nonzero_16x16_avx2(const int16_t *quantCoeff); #define xavs2_count_nonzero_32x32_avx2 
FPFX(count_nonzero_32x32_avx2) int xavs2_count_nonzero_32x32_avx2(const int16_t *quantCoeff); #define xavs2_weight_pp_sse4 FPFX(weight_pp_sse4) void xavs2_weight_pp_sse4(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); #define xavs2_weight_pp_avx2 FPFX(weight_pp_avx2) void xavs2_weight_pp_avx2(const pel_t *src, pel_t *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); #define xavs2_weight_sp_sse4 FPFX(weight_sp_sse4) void xavs2_weight_sp_sse4(const int16_t *src, pel_t *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); #define xavs2_pixel_ssim_4x4x2_core_mmx2 FPFX(pixel_ssim_4x4x2_core_mmx2) void xavs2_pixel_ssim_4x4x2_core_mmx2(const pel_t *pix1, intptr_t stride1, const pel_t *pix2, intptr_t stride2, int sums[2][4]); #define xavs2_pixel_ssim_4x4x2_core_sse2 FPFX(pixel_ssim_4x4x2_core_sse2) void xavs2_pixel_ssim_4x4x2_core_sse2(const pel_t *pix1, intptr_t stride1, const pel_t *pix2, intptr_t stride2, int sums[2][4]); #define xavs2_pixel_ssim_4x4x2_core_avx FPFX(pixel_ssim_4x4x2_core_avx) void xavs2_pixel_ssim_4x4x2_core_avx(const pel_t *pix1, intptr_t stride1, const pel_t *pix2, intptr_t stride2, int sums[2][4]); #define xavs2_pixel_ssim_end4_sse2 FPFX(pixel_ssim_end4_sse2) float xavs2_pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width); #define xavs2_pixel_ssim_end4_avx FPFX(pixel_ssim_end4_avx) float xavs2_pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width); #define xavs2_scale1D_128to64_ssse3 FPFX(scale1D_128to64_ssse3) void xavs2_scale1D_128to64_ssse3(pel_t*, const pel_t*); #define xavs2_scale1D_128to64_avx2 FPFX(scale1D_128to64_avx2) void xavs2_scale1D_128to64_avx2(pel_t*, const pel_t*); #define xavs2_scale2D_64to32_ssse3 FPFX(scale2D_64to32_ssse3) void xavs2_scale2D_64to32_ssse3(pel_t*, const pel_t*, intptr_t); #define xavs2_scale2D_64to32_avx2 FPFX(scale2D_64to32_avx2) void xavs2_scale2D_64to32_avx2(pel_t*, const pel_t*, intptr_t); #define xavs2_scanPosLast_x64 FPFX(scanPosLast_x64) int xavs2_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize); #define xavs2_scanPosLast_avx2_bmi2 FPFX(scanPosLast_avx2_bmi2) int xavs2_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t *scanCG4x4, const int trSize); #define xavs2_findPosFirstLast_ssse3 FPFX(findPosFirstLast_ssse3) uint32_t xavs2_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]); #define xavs2_costCoeffNxN_sse4 FPFX(costCoeffNxN_sse4) uint32_t xavs2_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase); #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \ void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); #define CHROMA_PIXELSUB_DEF(cpu) \ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 
16, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); #define CHROMA_422_PIXELSUB_DEF(cpu) \ SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu); #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \ void xavs2_pixel_sub_ps_ ## W ## x ## H ## cpu(coeff_t *dst, intptr_t destride, const pel_t *src0, const pel_t *src1, intptr_t srcstride0, intptr_t srcstride1); \ void xavs2_pixel_add_ps_ ## W ## x ## H ## cpu(pel_t *dst, intptr_t destride, const pel_t *src0, const coeff_t * src1, intptr_t srcStride0, intptr_t srcStride1); #define LUMA_PIXELSUB_DEF(cpu) \ SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \ SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \ SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \ SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu); LUMA_PIXELSUB_DEF(_sse2); CHROMA_PIXELSUB_DEF(_sse2); CHROMA_422_PIXELSUB_DEF(_sse2); LUMA_PIXELSUB_DEF(_sse4); CHROMA_PIXELSUB_DEF(_sse4); CHROMA_422_PIXELSUB_DEF(_sse4); #define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \ uint64_t xavs2_pixel_var_ ## W ## x ## H ## cpu(const pel_t *pix, intptr_t pixstride); #define LUMA_PIXELVAR_DEF(cpu) \ SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \ SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \ SETUP_LUMA_PIXELVAR_FUNC(32, 32, cpu); \ SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu); LUMA_PIXELVAR_DEF(_sse2); LUMA_PIXELVAR_DEF(_xop); LUMA_PIXELVAR_DEF(_avx); #undef CHROMA_PIXELSUB_DEF #undef CHROMA_422_PIXELSUB_DEF #undef LUMA_PIXELSUB_DEF #undef LUMA_PIXELVAR_DEF #undef SETUP_CHROMA_PIXELSUB_PS_FUNC #undef SETUP_LUMA_PIXELSUB_PS_FUNC #undef SETUP_LUMA_PIXELVAR_FUNC #endif // ifndef XAVS2_PIXEL_UTIL_H xavs2-1.3/source/common/x86/pixel-util8.asm000066400000000000000000006416071340660520300205400ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Min Chen ;* Nabajit Deka ;* Rajesh Paulraj ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 %if BIT_DEPTH == 12 ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64 ssim_c2: times 4 dd 60851437.92 ; .03*.03*4095*4095*64*63 pf_64: times 4 dd 64.0 pf_128: times 4 dd 128.0 %elif BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 pf_64: times 4 dd 64.0 pf_128: times 4 dd 128.0 %elif BIT_DEPTH == 9 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 %else ; 8-bit ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif mask_ff: times 16 db 0xff times 16 db 0 deinterleave_shuf: times 2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 interleave_shuf: times 2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 deinterleave_word_shuf: times 2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 hmulw_16p: times 8 dw 1 times 4 dw 1, -1 SECTION .text cextern pw_1 cextern pw_0_7 cextern pb_1 cextern pb_128 cextern pw_00ff cextern pw_1023 cextern pw_3fff cextern pw_2000 cextern pw_pixel_max cextern pd_1 cextern pd_32767 cextern pd_n32768 cextern pb_2 cextern pb_4 cextern pb_8 cextern pb_15 cextern pb_16 cextern pb_32 cextern pb_64 cextern hmul_16p cextern trans8_shuf ; cextern_naked private_prefix %+ _entropyStateBits cextern pb_movemask cextern pw_exp2_0_15 ;----------------------------------------------------------------------------- ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) ;----------------------------------------------------------------------------- INIT_XMM sse2 %if HIGH_BIT_DEPTH cglobal getResidual4, 4,4,4 add r3, r3 ; row 0-1 movh m0, [r0] movh m1, [r0 + r3] movh m2, [r1] movh m3, [r1 + r3] punpcklqdq m0, m1 punpcklqdq m2, m3 psubw m0, m2 movh [r2], m0 movhps [r2 + r3], m0 lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] ; row 2-3 movh m0, [r0] movh m1, [r0 + r3] movh m2, [r1] movh m3, [r1 + r3] punpcklqdq m0, m1 punpcklqdq m2, m3 psubw m0, m2 movh [r2], m0 movhps [r2 + r3], m0 RET %else cglobal getResidual4, 4,4,5 pxor m0, m0 ; row 0-1 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r3] punpckldq m1, m2 punpcklbw m1, m0 punpckldq m3, m4 punpcklbw m3, m0 psubw m1, m3 movh [r2], m1 movhps [r2 + r3 * 2], m1 lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] ; row 2-3 movd m1, [r0] movd m2, [r0 + r3] movd m3, [r1] movd m4, [r1 + r3] punpckldq m1, m2 punpcklbw m1, m0 punpckldq m3, m4 punpcklbw m3, m0 psubw m1, m3 movh [r2], m1 movhps [r2 + r3 * 2], m1 RET %endif INIT_XMM sse2 %if HIGH_BIT_DEPTH cglobal getResidual8, 4,4,4 add r3, r3 %assign x 0 %rep 8/2 ; row 0-1 movu m1, [r0] movu m2, [r0 + r3] movu m3, [r1] movu m4, [r1 + r3] psubw m1, m3 psubw m2, m4 movu [r2], m1 movu [r2 + r3], m2 %assign x x+1 %if (x != 4) lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] %endif %endrep RET %else cglobal getResidual8, 4,4,5 pxor m0, m0 %assign x 0 %rep 8/2 ; row 0-1 movh m1, [r0] movh m2, [r0 + r3] movh m3, [r1] movh m4, [r1 + r3] punpcklbw m1, m0 punpcklbw m2, m0 punpcklbw m3, m0 punpcklbw m4, m0 psubw m1, m3 psubw m2, m4 movu [r2], m1 movu [r2 + r3 * 2], m2 %assign x x+1 %if (x != 4) lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] %endif %endrep RET %endif %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal getResidual16, 4,5,6 add r3, r3 mov r4d, 16/4 .loop: ; row 0-1 
movu m0, [r0] movu m1, [r0 + 16] movu m2, [r0 + r3] movu m3, [r0 + r3 + 16] movu m4, [r1] movu m5, [r1 + 16] psubw m0, m4 psubw m1, m5 movu m4, [r1 + r3] movu m5, [r1 + r3 + 16] psubw m2, m4 psubw m3, m5 lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] movu [r2], m0 movu [r2 + 16], m1 movu [r2 + r3], m2 movu [r2 + r3 + 16], m3 lea r2, [r2 + r3 * 2] ; row 2-3 movu m0, [r0] movu m1, [r0 + 16] movu m2, [r0 + r3] movu m3, [r0 + r3 + 16] movu m4, [r1] movu m5, [r1 + 16] psubw m0, m4 psubw m1, m5 movu m4, [r1 + r3] movu m5, [r1 + r3 + 16] psubw m2, m4 psubw m3, m5 movu [r2], m0 movu [r2 + 16], m1 movu [r2 + r3], m2 movu [r2 + r3 + 16], m3 dec r4d lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal getResidual16, 4,5,8 mov r4d, 16/4 pxor m0, m0 .loop: ; row 0-1 movu m1, [r0] movu m2, [r0 + r3] movu m3, [r1] movu m4, [r1 + r3] pmovzxbw m5, m1 punpckhbw m1, m0 pmovzxbw m6, m2 punpckhbw m2, m0 pmovzxbw m7, m3 punpckhbw m3, m0 psubw m5, m7 psubw m1, m3 pmovzxbw m7, m4 punpckhbw m4, m0 psubw m6, m7 psubw m2, m4 movu [r2], m5 movu [r2 + 16], m1 movu [r2 + r3 * 2], m6 movu [r2 + r3 * 2 + 16], m2 lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] ; row 2-3 movu m1, [r0] movu m2, [r0 + r3] movu m3, [r1] movu m4, [r1 + r3] pmovzxbw m5, m1 punpckhbw m1, m0 pmovzxbw m6, m2 punpckhbw m2, m0 pmovzxbw m7, m3 punpckhbw m3, m0 psubw m5, m7 psubw m1, m3 pmovzxbw m7, m4 punpckhbw m4, m0 psubw m6, m7 psubw m2, m4 movu [r2], m5 movu [r2 + 16], m1 movu [r2 + r3 * 2], m6 movu [r2 + r3 * 2 + 16], m2 dec r4d lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] jnz .loop RET %endif %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal getResidual16, 4,4,5 add r3, r3 pxor m0, m0 %assign x 0 %rep 16/2 movu m1, [r0] movu m2, [r0 + r3] movu m3, [r1] movu m4, [r1 + r3] psubw m1, m3 psubw m2, m4 movu [r2], m1 movu [r2 + r3], m2 %assign x x+1 %if (x != 8) lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] %endif %endrep RET %else INIT_YMM avx2 cglobal getResidual16, 4,5,8 lea r4, [r3 * 2] add r4d, r3d %assign x 0 %rep 4 pmovzxbw m0, [r0] pmovzxbw m1, [r0 + r3] pmovzxbw m2, [r0 + r3 * 2] pmovzxbw m3, [r0 + r4] pmovzxbw m4, [r1] pmovzxbw m5, [r1 + r3] pmovzxbw m6, [r1 + r3 * 2] pmovzxbw m7, [r1 + r4] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r2], m0 movu [r2 + r3 * 2], m1 movu [r2 + r3 * 2 * 2], m2 movu [r2 + r4 * 2], m3 %assign x x+1 %if (x != 4) lea r0, [r0 + r3 * 2 * 2] lea r1, [r1 + r3 * 2 * 2] lea r2, [r2 + r3 * 4 * 2] %endif %endrep RET %endif %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal getResidual32, 4,5,6 add r3, r3 mov r4d, 32/2 .loop: ; row 0 movu m0, [r0] movu m1, [r0 + 16] movu m2, [r0 + 32] movu m3, [r0 + 48] movu m4, [r1] movu m5, [r1 + 16] psubw m0, m4 psubw m1, m5 movu m4, [r1 + 32] movu m5, [r1 + 48] psubw m2, m4 psubw m3, m5 movu [r2], m0 movu [r2 + 16], m1 movu [r2 + 32], m2 movu [r2 + 48], m3 ; row 1 movu m0, [r0 + r3] movu m1, [r0 + r3 + 16] movu m2, [r0 + r3 + 32] movu m3, [r0 + r3 + 48] movu m4, [r1 + r3] movu m5, [r1 + r3 + 16] psubw m0, m4 psubw m1, m5 movu m4, [r1 + r3 + 32] movu m5, [r1 + r3 + 48] psubw m2, m4 psubw m3, m5 movu [r2 + r3], m0 movu [r2 + r3 + 16], m1 movu [r2 + r3 + 32], m2 movu [r2 + r3 + 48], m3 dec r4d lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal getResidual32, 4,5,7 mov r4d, 32/2 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r0 + 16] movu m3, [r1] movu m4, [r1 + 16] pmovzxbw m5, m1 punpckhbw m1, m0 pmovzxbw m6, m3 punpckhbw m3, m0 
psubw m5, m6 psubw m1, m3 movu [r2 + 0 * 16], m5 movu [r2 + 1 * 16], m1 pmovzxbw m5, m2 punpckhbw m2, m0 pmovzxbw m6, m4 punpckhbw m4, m0 psubw m5, m6 psubw m2, m4 movu [r2 + 2 * 16], m5 movu [r2 + 3 * 16], m2 movu m1, [r0 + r3] movu m2, [r0 + r3 + 16] movu m3, [r1 + r3] movu m4, [r1 + r3 + 16] pmovzxbw m5, m1 punpckhbw m1, m0 pmovzxbw m6, m3 punpckhbw m3, m0 psubw m5, m6 psubw m1, m3 movu [r2 + r3 * 2 + 0 * 16], m5 movu [r2 + r3 * 2 + 1 * 16], m1 pmovzxbw m5, m2 punpckhbw m2, m0 pmovzxbw m6, m4 punpckhbw m4, m0 psubw m5, m6 psubw m2, m4 movu [r2 + r3 * 2 + 2 * 16], m5 movu [r2 + r3 * 2 + 3 * 16], m2 dec r4d lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] jnz .loop RET %endif %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal getResidual32, 4,4,5 add r3, r3 pxor m0, m0 %assign x 0 %rep 32 movu m1, [r0] movu m2, [r0 + 32] movu m3, [r1] movu m4, [r1 + 32] psubw m1, m3 psubw m2, m4 movu [r2], m1 movu [r2 + 32], m2 %assign x x+1 %if (x != 32) lea r0, [r0 + r3] lea r1, [r1 + r3] lea r2, [r2 + r3] %endif %endrep RET %else INIT_YMM avx2 cglobal getResidual32, 4,5,8 lea r4, [r3 * 2] %assign x 0 %rep 16 pmovzxbw m0, [r0] pmovzxbw m1, [r0 + 16] pmovzxbw m2, [r0 + r3] pmovzxbw m3, [r0 + r3 + 16] pmovzxbw m4, [r1] pmovzxbw m5, [r1 + 16] pmovzxbw m6, [r1 + r3] pmovzxbw m7, [r1 + r3 + 16] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r2 + 0 ], m0 movu [r2 + 32], m1 movu [r2 + r4 + 0], m2 movu [r2 + r4 + 32], m3 %assign x x+1 %if (x != 16) lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] %endif %endrep RET %endif %if 0 ; REMOVED ;----------------------------------------------------------------------------- ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal quant, 5,6,8 ; fill qbits movd m4, r4d ; m4 = qbits ; fill qbits-8 sub r4d, 8 movd m6, r4d ; m6 = qbits8 ; fill offset movd m5, r5m pshufd m5, m5, 0 ; m5 = add lea r5, [pd_1] mov r4d, r6m shr r4d, 3 pxor m7, m7 ; m7 = numZero .loop: ; 4 coeff pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 psrad m2, m4 ; m2 = level1 pslld m3, m2, 8 psrad m1, m6 psubd m1, m3 ; m1 = deltaU1 movu [r2], m1 psignd m3, m2, m0 pminud m2, [r5] paddd m7, m2 packssdw m3, m3 movh [r3], m3 ; 4 coeff pmovsxwd m0, [r0 + 8] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + 16] ; m0 = tmpLevel1 paddd m2, m1, m5 psrad m2, m4 ; m2 = level1 pslld m3, m2, 8 psrad m1, m6 psubd m1, m3 ; m1 = deltaU1 movu [r2 + 16], m1 psignd m3, m2, m0 pminud m2, [r5] paddd m7, m2 packssdw m3, m3 movh [r3 + 8], m3 add r0, 16 add r1, 32 add r2, 32 add r3, 16 dec r4d jnz .loop pshufd m0, m7, 00001110b paddd m0, m7 pshufd m1, m0, 00000001b paddd m0, m1 movd eax, m0 RET %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal quant, 5,6,9 ; fill qbits movd xm4, r4d ; m4 = qbits ; fill qbits-8 sub r4d, 8 movd xm6, r4d ; m6 = qbits8 ; fill offset %if UNIX64 == 0 vpbroadcastd m5, r5m ; m5 = add %else ; Mac movd xm5, r5m vpbroadcastd m5, xm5 ; m5 = add %endif lea r5, [pw_1] mov r4d, r6m shr r4d, 4 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 psrad m2, xm4 ; m2 = level1 pslld m3, m2, 8 psrad m1, xm6 psubd m1, m3 ; m1 = deltaU1 movu [r2], m1 psignd m2, m0 ; 8 coeff pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m3, m1, m5 psrad m3, xm4 ; m2 = 
level1 pslld m8, m3, 8 psrad m1, xm6 psubd m1, m8 ; m1 = deltaU1 movu [r2 + mmsize], m1 psignd m3, m0 packssdw m2, m3 vpermq m2, m2, q3120 movu [r3], m2 ; count non-zero coeff ; TODO: popcnt is faster, but some CPU can't support pminuw m2, [r5] paddw m7, m2 add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize dec r4d jnz .loop ; sum count xorpd m0, m0 psadbw m7, m0 vextracti128 xm1, m7, 1 paddd xm7, xm1 movhlps xm0, xm7 paddd xm7, xm0 movd eax, xm7 RET %else ; ARCH_X86_64 == 1 INIT_YMM avx2 cglobal quant, 5,6,8 ; fill qbits movd xm4, r4d ; m4 = qbits ; fill qbits-8 sub r4d, 8 movd xm6, r4d ; m6 = qbits8 ; fill offset %if UNIX64 == 0 vpbroadcastd m5, r5m ; m5 = add %else ; Mac movd xm5, r5m vpbroadcastd m5, xm5 ; m5 = add %endif lea r5, [pd_1] mov r4d, r6m shr r4d, 4 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 psrad m2, xm4 ; m2 = level1 pslld m3, m2, 8 psrad m1, xm6 psubd m1, m3 ; m1 = deltaU1 movu [r2], m1 psignd m3, m2, m0 pminud m2, [r5] paddd m7, m2 packssdw m3, m3 vpermq m3, m3, q0020 movu [r3], xm3 ; 8 coeff pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m2, m1, m5 psrad m2, xm4 ; m2 = level1 pslld m3, m2, 8 psrad m1, xm6 psubd m1, m3 ; m1 = deltaU1 movu [r2 + mmsize], m1 psignd m3, m2, m0 pminud m2, [r5] paddd m7, m2 packssdw m3, m3 vpermq m3, m3, q0020 movu [r3 + mmsize/2], xm3 add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize dec r4d jnz .loop xorpd m0, m0 psadbw m7, m0 vextracti128 xm1, m7, 1 paddd xm7, xm1 movhlps xm0, xm7 paddd xm7, xm0 movd eax, xm7 RET %endif ; ARCH_X86_64 == 1 ;----------------------------------------------------------------------------- ; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal nquant, 3,5,8 movd m6, r4m mov r4d, r5m pxor m7, m7 ; m7 = numZero movd m5, r3m ; m5 = qbits pshufd m6, m6, 0 ; m6 = add mov r3d, r4d ; r3 = numCoeff shr r4d, 3 pxor m4, m4 .loop: pmovsxwd m0, [r0] ; m0 = level pmovsxwd m1, [r0 + 8] ; m1 = level pabsd m2, m0 pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff paddd m2, m6 psrad m2, m5 ; m0 = level1 psignd m2, m0 pabsd m3, m1 pmulld m3, [r1 + 16] ; m1 = tmpLevel1 * qcoeff paddd m3, m6 psrad m3, m5 ; m1 = level1 psignd m3, m1 packssdw m2, m3 pabsw m2, m2 movu [r2], m2 add r0, 16 add r1, 32 add r2, 16 pcmpeqw m2, m4 psubw m7, m2 dec r4d jnz .loop packuswb m7, m7 psadbw m7, m4 mov eax, r3d movd r4d, m7 sub eax, r4d ; numSig RET INIT_YMM avx2 cglobal nquant, 3,5,7 %if UNIX64 == 0 vpbroadcastd m4, r4m %else ; Mac movd xm4, r4m vpbroadcastd m4, xm4 %endif vpbroadcastd m6, [pw_1] mov r4d, r5m pxor m5, m5 ; m7 = numZero movd xm3, r3m ; m5 = qbits mov r3d, r4d ; r3 = numCoeff shr r4d, 4 .loop: pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff paddd m1, m4 psrad m1, xm3 ; m0 = level1 psignd m1, m0 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m2, m0 pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff paddd m2, m4 psrad m2, xm3 ; m0 = level1 psignd m2, m0 packssdw m1, m2 pabsw m1, m1 vpermq m2, m1, q3120 movu [r2], m2 add r0, mmsize add r1, mmsize * 2 add r2, mmsize pminuw m1, m6 paddw m5, m1 dec r4d jnz .loop pxor m0, m0 psadbw m5, m0 vextracti128 xm0, m5, 1 paddd xm5, xm0 pshufd xm0, xm5, 2 paddd xm5, xm0 movd eax, xm5 RET 
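;-----------------------------------------------------------------------------
; Scalar reference sketch for nquant (illustrative only, written from a
; reading of the SSE4/AVX2 bodies above; it is not taken from the xavs2 C
; sources, and this whole quant/nquant/dequant region is compiled out by the
; enclosing "%if 0 ; REMOVED" guard). Parameter names follow the prototype
; comment above; the helper name nquant_ref is hypothetical.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t nquant_ref(const int16_t *coef, const int32_t *quantCoeff,
;                              int16_t *qCoef, int qBits, int add, int numCoeff)
;   {
;       uint32_t numSig = 0;                      /* count of non-zero outputs   */
;       for (int i = 0; i < numCoeff; i++) {
;           int64_t level = ((int64_t)abs(coef[i]) * quantCoeff[i] + add) >> qBits;
;           if (level > 32767)                    /* the SIMD path packs with    */
;               level = 32767;                    /* signed 16-bit saturation    */
;           qCoef[i] = (int16_t)level;            /* magnitude only (pabsw)      */
;           numSig  += (level != 0);
;       }
;       return numSig;
;   }
;-----------------------------------------------------------------------------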
;----------------------------------------------------------------------------- ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal dequant_normal, 5,5,5 mova m2, [pw_1] %if HIGH_BIT_DEPTH cmp r3d, 32767 jle .skip shr r3d, (BIT_DEPTH - 8) sub r4d, (BIT_DEPTH - 8) .skip: %endif movd m0, r4d ; m0 = shift add r4d, 15 bts r3d, r4d movd m1, r3d pshufd m1, m1, 0 ; m1 = dword [add scale] ; m0 = shift ; m1 = scale ; m2 = word [1] .loop: movu m3, [r0] punpckhwd m4, m3, m2 punpcklwd m3, m2 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) pmaddwd m4, m1 psrad m3, m0 psrad m4, m0 packssdw m3, m4 mova [r1], m3 add r0, 16 add r1, 16 sub r2d, 8 jnz .loop RET ;---------------------------------------------------------------------------------------------------------------------- ;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift) ;---------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal dequant_scaling, 6,6,6 add r5d, 4 shr r3d, 3 ; num/8 cmp r5d, r4d jle .skip sub r5d, r4d mova m0, [pd_1] movd m1, r5d ; shift - per dec r5d movd m2, r5d ; shift - per - 1 pslld m0, m2 ; 1 << shift - per - 1 .part0: pmovsxwd m2, [r0] pmovsxwd m4, [r0 + 8] movu m3, [r1] movu m5, [r1 + 16] pmulld m2, m3 pmulld m4, m5 paddd m2, m0 paddd m4, m0 psrad m2, m1 psrad m4, m1 packssdw m2, m4 movu [r2], m2 add r0, 16 add r1, 32 add r2, 16 dec r3d jnz .part0 jmp .end .skip: sub r4d, r5d ; per - shift movd m0, r4d .part1: pmovsxwd m2, [r0] pmovsxwd m4, [r0 + 8] movu m3, [r1] movu m5, [r1 + 16] pmulld m2, m3 pmulld m4, m5 packssdw m2, m4 pmovsxwd m1, m2 psrldq m2, 8 pmovsxwd m2, m2 pslld m1, m0 pslld m2, m0 packssdw m1, m2 movu [r2], m1 add r0, 16 add r1, 32 add r2, 16 dec r3d jnz .part1 .end: RET ;---------------------------------------------------------------------------------------------------------------------- ;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift) ;---------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal dequant_scaling, 6,6,6 add r5d, 4 shr r3d, 4 ; num/16 cmp r5d, r4d jle .skip sub r5d, r4d mova m0, [pd_1] movd xm1, r5d ; shift - per dec r5d movd xm2, r5d ; shift - per - 1 pslld m0, xm2 ; 1 << shift - per - 1 .part0: pmovsxwd m2, [r0] pmovsxwd m4, [r0 + 16] movu m3, [r1] movu m5, [r1 + 32] pmulld m2, m3 pmulld m4, m5 paddd m2, m0 paddd m4, m0 psrad m2, xm1 psrad m4, xm1 packssdw m2, m4 vpermq m2, m2, 11011000b movu [r2], m2 add r0, 32 add r1, 64 add r2, 32 dec r3d jnz .part0 jmp .end .skip: sub r4d, r5d ; per - shift movd xm0, r4d .part1: pmovsxwd m2, [r0] pmovsxwd m4, [r0 + 16] movu m3, [r1] movu m5, [r1 + 32] pmulld m2, m3 pmulld m4, m5 packssdw m2, m4 vextracti128 xm4, m2, 1 pmovsxwd m1, xm2 pmovsxwd m2, xm4 pslld m1, xm0 pslld m2, xm0 packssdw m1, m2 movu [r2], m1 add r0, 32 add r1, 64 add r2, 32 dec r3d jnz .part1 .end: RET INIT_YMM avx2 cglobal dequant_normal, 5,5,7 vpbroadcastd m2, [pw_1] ; m2 = word [1] vpbroadcastd m5, [pd_32767] ; m5 = dword [32767] vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768] %if HIGH_BIT_DEPTH cmp r3d, 32767 jle .skip shr r3d, (BIT_DEPTH - 8) sub r4d, (BIT_DEPTH - 8) .skip: %endif movd xm0, r4d ; m0 = shift add r4d, -1+16 bts r3d, r4d movd 
xm1, r3d vpbroadcastd m1, xm1 ; m1 = dword [add scale] ; m0 = shift ; m1 = scale ; m2 = word [1] shr r2d, 4 .loop: movu m3, [r0] punpckhwd m4, m3, m2 punpcklwd m3, m2 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) pmaddwd m4, m1 psrad m3, xm0 psrad m4, xm0 pminsd m3, m5 pmaxsd m3, m6 pminsd m4, m5 pmaxsd m4, m6 packssdw m3, m4 mova [r1 + 0 * mmsize/2], xm3 vextracti128 [r1 + 1 * mmsize/2], m3, 1 add r0, mmsize add r1, mmsize dec r2d jnz .loop RET %endif ; %if 0 ; REMOVED ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_4x4_sse2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal count_nonzero_4x4, 1,1,2 pxor m0, m0 mova m1, [r0 + 0] packsswb m1, [r0 + 16] pcmpeqb m1, m0 paddb m1, [pb_1] psadbw m1, m0 pshufd m0, m1, 2 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_4x4_avx2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal count_nonzero_4x4, 1,1,2 pxor m0, m0 movu m1, [r0] pcmpeqw m1, m0 pmovmskb eax, m1 not eax popcnt eax, eax shr eax, 1 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_8x8_sse2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal count_nonzero_8x8, 1,1,3 pxor m0, m0 movu m1, [pb_4] %rep 4 mova m2, [r0 + 0] packsswb m2, [r0 + 16] add r0, 32 pcmpeqb m2, m0 paddb m1, m2 %endrep psadbw m1, m0 pshufd m0, m1, 2 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_8x8_avx2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal count_nonzero_8x8, 1,1,3 pxor m0, m0 movu m1, [pb_2] mova m2, [r0] packsswb m2, [r0 + 32] pcmpeqb m2, m0 paddb m1, m2 mova m2, [r0 + 64] packsswb m2, [r0 + 96] pcmpeqb m2, m0 paddb m1, m2 psadbw m1, m0 vextracti128 xm0, m1, 1 paddd m0, m1 pshufd m1, m0, 2 paddd m0, m1 movd eax, xm0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_16x16_sse2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal count_nonzero_16x16, 1,1,3 pxor m0, m0 movu m1, [pb_16] %rep 16 mova m2, [r0 + 0] packsswb m2, [r0 + 16] add r0, 32 pcmpeqb m2, m0 paddb m1, m2 %endrep psadbw m1, m0 pshufd m0, m1, 2 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_16x16_avx2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal count_nonzero_16x16, 1,1,3 pxor m0, m0 movu m1, [pb_8] %assign x 0 %rep 8 mova m2, [r0 + x] packsswb m2, [r0 + x + 32] %assign x x+64 pcmpeqb m2, m0 paddb m1, m2 %endrep psadbw m1, m0 vextracti128 xm0, m1, 1 paddd m0, m1 pshufd m1, m0, 2 paddd m0, m1 movd eax, xm0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_32x32_sse2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal count_nonzero_32x32, 1,1,3 pxor m0, m0 movu m1, [pb_64] %rep 64 mova m2, [r0 + 0] packsswb 
m2, [r0 + 16] add r0, 32 pcmpeqb m2, m0 paddb m1, m2 %endrep psadbw m1, m0 pshufd m0, m1, 2 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int xavs2_count_nonzero_32x32_avx2(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal count_nonzero_32x32, 1,1,3 pxor m0, m0 movu m1, [pb_32] %assign x 0 %rep 32 mova m2, [r0 + x] packsswb m2, [r0 + x + 32] %assign x x+64 pcmpeqb m2, m0 paddb m1, m2 %endrep psadbw m1, m0 vextracti128 xm0, m1, 1 paddd m0, m1 pshufd m1, m0, 2 paddd m0, m1 movd eax, xm0 RET ;----------------------------------------------------------------------------------------------------------------------------------------------- ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) ;----------------------------------------------------------------------------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal weight_pp, 4,7,7 %define correction (14 - BIT_DEPTH) mova m6, [pw_pixel_max] mov r6d, r6m mov r4d, r4m mov r5d, r5m shl r6d, 16 - correction or r6d, r5d ; assuming both (w0) and round are using maximum of 16 bits each. movd m0, r6d pshufd m0, m0, 0 ; m0 = [w0, round] mov r5d, r7m sub r5d, correction movd m1, r5d movd m2, r8m pshufd m2, m2, 0 mova m5, [pw_1] sub r2d, r3d add r2d, r2d shr r3d, 4 .loopH: mov r5d, r3d .loopW: movu m4, [r0] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, m1 paddd m3, m2 ; TODO: we can put Offset into Round, but we have to analyze Dynamic Range before that. punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, m1 paddd m4, m2 packusdw m3, m4 pminuw m3, m6 movu [r1], m3 movu m4, [r0 + mmsize] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, m1 paddd m3, m2 punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, m1 paddd m4, m2 packusdw m3, m4 pminuw m3, m6 movu [r1 + mmsize], m3 add r0, 2 * mmsize add r1, 2 * mmsize dec r5d jnz .loopW add r0, r2 add r1, r2 dec r4d jnz .loopH RET %else ; end of (HIGH_BIT_DEPTH == 1) INIT_XMM sse4 cglobal weight_pp, 6,7,6 shl r5d, 6 ; m0 = [w0<<6] mov r6d, r6m shl r6d, 16 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each. movd m0, r6d pshufd m0, m0, 0 ; m0 = [w0<<6, round] movd m1, r7m movd m2, r8m pshufd m2, m2, 0 mova m5, [pw_1] sub r2d, r3d shr r3d, 4 .loopH: mov r5d, r3d .loopW: pmovzxbw m4, [r0] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, m1 paddd m3, m2 punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, m1 paddd m4, m2 packssdw m3, m4 packuswb m3, m3 movh [r1], m3 pmovzxbw m4, [r0 + 8] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, m1 paddd m3, m2 punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, m1 paddd m4, m2 packssdw m3, m4 packuswb m3, m3 movh [r1 + 8], m3 add r0, 16 add r1, 16 dec r5d jnz .loopW lea r0, [r0 + r2] lea r1, [r1 + r2] dec r4d jnz .loopH RET %endif ; end of (HIGH_BIT_DEPTH == 0) %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal weight_pp, 6, 7, 7 %define correction (14 - BIT_DEPTH) mov r6d, r6m shl r6d, 16 - correction or r6d, r5d ; assuming both w0 and round are using maximum of 16 bits each. 
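; Hedged reference note (illustrative, not authoritative): like the other weight_pp
; variants in this file, this kernel evaluates, per pixel,
;     dst[x] = clip_pixel( ((src[x] * w0 + round) >> shift) + offset )
; with the bit-depth correction (14 - BIT_DEPTH) folded into the round/shift packing
; above; clip_pixel() is just shorthand for clamping to the valid pixel range.
; Packing (w0, round) into one dword lets a single pmaddwd on (pix, 1) word pairs
; produce pix*w0 + round in one step.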
movd xm0, r6d vpbroadcastd m0, xm0 mov r5d, r7m sub r5d, correction movd xm1, r5d vpbroadcastd m2, r8m mova m5, [pw_1] mova m6, [pw_pixel_max] add r2d, r2d add r3d, r3d sub r2d, r3d shr r3d, 5 .loopH: mov r5d, r3d .loopW: movu m4, [r0] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, xm1 paddd m3, m2 punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, xm1 paddd m4, m2 packusdw m3, m4 pminuw m3, m6 movu [r1], m3 add r0, 32 add r1, 32 dec r5d jnz .loopW lea r0, [r0 + r2] lea r1, [r1 + r2] dec r4d jnz .loopH %undef correction RET %else INIT_YMM avx2 cglobal weight_pp, 6, 7, 6 shl r5d, 6 ; m0 = [w0<<6] mov r6d, r6m shl r6d, 16 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each. movd xm0, r6d vpbroadcastd m0, xm0 movd xm1, r7m vpbroadcastd m2, r8m mova m5, [pw_1] sub r2d, r3d shr r3d, 4 .loopH: mov r5d, r3d .loopW: pmovzxbw m4, [r0] punpcklwd m3, m4, m5 pmaddwd m3, m0 psrad m3, xm1 paddd m3, m2 punpckhwd m4, m5 pmaddwd m4, m0 psrad m4, xm1 paddd m4, m2 packssdw m3, m4 vextracti128 xm4, m3, 1 packuswb xm3, xm4 movu [r1], xm3 add r0, 16 add r1, 16 dec r5d jnz .loopW lea r0, [r0 + r2] lea r1, [r1 + r2] dec r4d jnz .loopH RET %endif ;------------------------------------------------------------------------------------------------------------------------------------------------- ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) ;------------------------------------------------------------------------------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal weight_sp, 6,7,8 mova m1, [pw_pixel_max] mova m2, [pw_1] mov r6d, r7m shl r6d, 16 or r6d, r6m ; assuming both (w0) and round are using maximum of 16 bits each. 
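; Hedged reference note (illustrative): weight_sp reads int16 intermediate samples;
; the pw_2000 (0x2000) addition in the loop below appears to restore the internal
; offset of that intermediate representation (an assumption based on the constant's
; value) before applying, per sample,
;     dst[x] = clip_pixel( (((src[x] + 0x2000) * w0 + round) >> shift) + offset )
; where clip_pixel() is shorthand for clamping to the valid output pixel range.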
movd m3, r6d pshufd m3, m3, 0 ; m3 = [round w0] movd m4, r8m ; m4 = [shift] movd m5, r9m pshufd m5, m5, 0 ; m5 = [offset] ; correct row stride add r3d, r3d add r2d, r2d mov r6d, r4d and r6d, ~(mmsize / SIZEOF_PIXEL - 1) sub r3d, r6d sub r3d, r6d sub r2d, r6d sub r2d, r6d ; generate partial width mask (MUST BE IN XMM0) mov r6d, r4d and r6d, (mmsize / SIZEOF_PIXEL - 1) movd m0, r6d pshuflw m0, m0, 0 punpcklqdq m0, m0 pcmpgtw m0, [pw_0_7] .loopH: mov r6d, r4d .loopW: movu m6, [r0] paddw m6, [pw_2000] punpcklwd m7, m6, m2 pmaddwd m7, m3 psrad m7, m4 paddd m7, m5 punpckhwd m6, m2 pmaddwd m6, m3 psrad m6, m4 paddd m6, m5 packusdw m7, m6 pminuw m7, m1 sub r6d, (mmsize / SIZEOF_PIXEL) jl .widthLess8 movu [r1], m7 lea r0, [r0 + mmsize] lea r1, [r1 + mmsize] je .nextH jmp .loopW .widthLess8: movu m6, [r1] pblendvb m6, m6, m7, m0 movu [r1], m6 .nextH: add r0, r2 add r1, r3 dec r5d jnz .loopH RET %else ; end of (HIGH_BIT_DEPTH == 1) INIT_XMM sse4 %if ARCH_X86_64 cglobal weight_sp, 6, 7+2, 7 %define tmp_r0 r7 %define tmp_r1 r8 %else ; ARCH_X86_64 = 0 cglobal weight_sp, 6, 7, 7, 0-(2*4) %define tmp_r0 [(rsp + 0 * 4)] %define tmp_r1 [(rsp + 1 * 4)] %endif ; ARCH_X86_64 movd m0, r6m ; m0 = [w0] movd m1, r7m ; m1 = [round] punpcklwd m0, m1 pshufd m0, m0, 0 ; m0 = [w0 round] movd m1, r8m ; m1 = [shift] movd m2, r9m pshufd m2, m2, 0 ; m2 =[offset] mova m3, [pw_1] mova m4, [pw_2000] add r2d, r2d .loopH: mov r6d, r4d ; save old src and dst mov tmp_r0, r0 mov tmp_r1, r1 .loopW: movu m5, [r0] paddw m5, m4 punpcklwd m6,m5, m3 pmaddwd m6, m0 psrad m6, m1 paddd m6, m2 punpckhwd m5, m3 pmaddwd m5, m0 psrad m5, m1 paddd m5, m2 packssdw m6, m5 packuswb m6, m6 sub r6d, 8 jl .width4 movh [r1], m6 je .nextH add r0, 16 add r1, 8 jmp .loopW .width4: cmp r6d, -4 jl .width2 movd [r1], m6 je .nextH add r1, 4 pshufd m6, m6, 1 .width2: pextrw [r1], m6, 0 .nextH: mov r0, tmp_r0 mov r1, tmp_r1 lea r0, [r0 + r2] lea r1, [r1 + r3] dec r5d jnz .loopH RET %endif %if ARCH_X86_64 == 1 %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal weight_sp, 6,7,9 mova m1, [pw_pixel_max] mova m2, [pw_1] mov r6d, r7m shl r6d, 16 or r6d, r6m movd xm3, r6d vpbroadcastd m3, xm3 ; m3 = [round w0] movd xm4, r8m ; m4 = [shift] vpbroadcastd m5, r9m ; m5 = [offset] ; correct row stride add r3d, r3d add r2d, r2d mov r6d, r4d and r6d, ~(mmsize / SIZEOF_PIXEL - 1) sub r3d, r6d sub r3d, r6d sub r2d, r6d sub r2d, r6d ; generate partial width mask (MUST BE IN YMM0) mov r6d, r4d and r6d, (mmsize / SIZEOF_PIXEL - 1) movd xm0, r6d pshuflw m0, m0, 0 punpcklqdq m0, m0 vinserti128 m0, m0, xm0, 1 pcmpgtw m0, [pw_0_7] .loopH: mov r6d, r4d .loopW: movu m6, [r0] paddw m6, [pw_2000] punpcklwd m7, m6, m2 pmaddwd m7, m3 ;(round w0) psrad m7, xm4 ;(shift) paddd m7, m5 ;(offset) punpckhwd m6, m2 pmaddwd m6, m3 psrad m6, xm4 paddd m6, m5 packusdw m7, m6 pminuw m7, m1 sub r6d, (mmsize / SIZEOF_PIXEL) jl .width14 movu [r1], m7 lea r0, [r0 + mmsize] lea r1, [r1 + mmsize] je .nextH jmp .loopW .width14: add r6d, 16 cmp r6d, 14 jl .width12 movu [r1], xm7 vextracti128 xm8, m7, 1 movq [r1 + 16], xm8 pextrd [r1 + 24], xm8, 2 je .nextH .width12: cmp r6d, 12 jl .width10 movu [r1], xm7 vextracti128 xm8, m7, 1 movq [r1 + 16], xm8 je .nextH .width10: cmp r6d, 10 jl .width8 movu [r1], xm7 vextracti128 xm8, m7, 1 movd [r1 + 16], xm8 je .nextH .width8: cmp r6d, 8 jl .width6 movu [r1], xm7 je .nextH .width6 cmp r6d, 6 jl .width4 movq [r1], xm7 pextrd [r1 + 8], xm7, 2 je .nextH .width4: cmp r6d, 4 jl .width2 movq [r1], xm7 je .nextH add r1, 4 pshufd m6, m6, 1 je .nextH .width2: movd [r1], xm7 .nextH: add 
r0, r2 add r1, r3 dec r5d jnz .loopH RET %else INIT_YMM avx2 cglobal weight_sp, 6, 9, 7 mov r7d, r7m shl r7d, 16 or r7d, r6m movd xm0, r7d vpbroadcastd m0, xm0 ; m0 = times 8 dw w0, round movd xm1, r8m ; m1 = [shift] vpbroadcastd m2, r9m ; m2 = times 16 dw offset vpbroadcastw m3, [pw_1] vpbroadcastw m4, [pw_2000] add r2d, r2d ; 2 * srcstride mov r7, r0 mov r8, r1 .loopH: mov r6d, r4d ; width ; save old src and dst mov r0, r7 ; src mov r1, r8 ; dst .loopW: movu m5, [r0] paddw m5, m4 punpcklwd m6,m5, m3 pmaddwd m6, m0 psrad m6, xm1 paddd m6, m2 punpckhwd m5, m3 pmaddwd m5, m0 psrad m5, xm1 paddd m5, m2 packssdw m6, m5 packuswb m6, m6 vpermq m6, m6, 10001000b sub r6d, 16 jl .width8 movu [r1], xm6 je .nextH add r0, 32 add r1, 16 jmp .loopW .width8: add r6d, 16 cmp r6d, 8 jl .width4 movq [r1], xm6 je .nextH psrldq m6, 8 sub r6d, 8 add r1, 8 .width4: cmp r6d, 4 jl .width2 movd [r1], xm6 je .nextH add r1, 4 pshufd m6, m6, 1 .width2: pextrw [r1], xm6, 0 .nextH: lea r7, [r7 + r2] lea r8, [r8 + r3] dec r5d jnz .loopH RET %endif %endif ;----------------------------------------------------------------- ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- INIT_XMM sse2 cglobal transpose4, 3, 3, 4, dest, src, stride %if HIGH_BIT_DEPTH == 1 add r2, r2 movh m0, [r1] movh m1, [r1 + r2] movh m2, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movh m3, [r1 + r2] punpcklwd m0, m1 punpcklwd m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 movu [r0], m0 movu [r0 + 16], m1 %else ;HIGH_BIT_DEPTH == 0 movd m0, [r1] movd m1, [r1 + r2] movd m2, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movd m3, [r1 + r2] punpcklbw m0, m1 punpcklbw m2, m3 punpcklwd m0, m2 movu [r0], m0 %endif RET ;----------------------------------------------------------------- ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH == 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose8, 3, 5, 5 add r2, r2 lea r3, [3 * r2] lea r4, [r1 + 4 * r2] movu xm0, [r1] vinserti128 m0, m0, [r4], 1 movu xm1, [r1 + r2] vinserti128 m1, m1, [r4 + r2], 1 movu xm2, [r1 + 2 * r2] vinserti128 m2, m2, [r4 + 2 * r2], 1 movu xm3, [r1 + r3] vinserti128 m3, m3, [r4 + r3], 1 punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6] punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6] punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8] punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8] punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8] punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8] punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8] punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8] vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8] vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8] vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8] vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8] movu [r0 + 0 * 32], m3 movu [r0 + 1 * 32], m4 movu [r0 + 2 * 32], m1 movu [r0 + 3 * 32], m0 RET %endif INIT_XMM sse2 %macro TRANSPOSE_4x4 1 movh m0, [r1] movh m1, [r1 + r2] movh m2, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movh m3, [r1 + r2] punpcklwd m0, m1 punpcklwd m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 movh [r0], m0 movhps [r0 + %1], m0 movh [r0 + 2 * %1], m1 lea r0, [r0 + 2 * %1] movhps [r0 + %1], m1 %endmacro cglobal transpose8_internal TRANSPOSE_4x4 r5 lea r1, [r1 + 2 * r2] lea r0, [r3 + 8] TRANSPOSE_4x4 r5 lea r1, [r1 + 2 * r2] neg r2 lea r1, [r1 + r2 * 8 + 8] neg r2 
lea r0, [r3 + 4 * r5] TRANSPOSE_4x4 r5 lea r1, [r1 + 2 * r2] lea r0, [r3 + 8 + 4 * r5] TRANSPOSE_4x4 r5 ret cglobal transpose8, 3, 6, 4, dest, src, stride add r2, r2 mov r3, r0 mov r5, 16 call transpose8_internal RET %else ;HIGH_BIT_DEPTH == 0 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose8, 3, 4, 4 lea r3, [r2 * 3] movq xm0, [r1] movhps xm0, [r1 + 2 * r2] movq xm1, [r1 + r2] movhps xm1, [r1 + r3] lea r1, [r1 + 4 * r2] movq xm2, [r1] movhps xm2, [r1 + 2 * r2] movq xm3, [r1 + r2] movhps xm3, [r1 + r3] vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7] vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8] punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6] punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8] punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8] punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8] mova m0, [trans8_shuf] vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8] vpermd m2, m0, m2 ;[4 - 5; 6 - 7][row1row2row3row4row5row6row7row8] movu [r0], m1 movu [r0 + 32], m2 RET %endif INIT_XMM sse2 cglobal transpose8, 3, 5, 8, dest, src, stride lea r3, [2 * r2] lea r4, [3 * r2] movh m0, [r1] movh m1, [r1 + r2] movh m2, [r1 + r3] movh m3, [r1 + r4] movh m4, [r1 + 4 * r2] lea r1, [r1 + 4 * r2] movh m5, [r1 + r2] movh m6, [r1 + r3] movh m7, [r1 + r4] punpcklbw m0, m1 punpcklbw m2, m3 punpcklbw m4, m5 punpcklbw m6, m7 punpckhwd m1, m0, m2 punpcklwd m0, m2 punpckhwd m5, m4, m6 punpcklwd m4, m6 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m3, m1, m5 punpckldq m1, m5 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m1 movu [r0 + 48], m3 RET %endif %macro TRANSPOSE_8x8 1 movh m0, [r1] movh m1, [r1 + r2] movh m2, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movh m3, [r1 + r2] movh m4, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movh m5, [r1 + r2] movh m6, [r1 + 2 * r2] lea r1, [r1 + 2 * r2] movh m7, [r1 + r2] punpcklbw m0, m1 punpcklbw m2, m3 punpcklbw m4, m5 punpcklbw m6, m7 punpckhwd m1, m0, m2 punpcklwd m0, m2 punpckhwd m5, m4, m6 punpcklwd m4, m6 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m3, m1, m5 punpckldq m1, m5 movh [r0], m0 movhps [r0 + %1], m0 movh [r0 + 2 * %1], m2 lea r0, [r0 + 2 * %1] movhps [r0 + %1], m2 movh [r0 + 2 * %1], m1 lea r0, [r0 + 2 * %1] movhps [r0 + %1], m1 movh [r0 + 2 * %1], m3 lea r0, [r0 + 2 * %1] movhps [r0 + %1], m3 %endmacro ;----------------------------------------------------------------- ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH == 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose16x8_internal movu m0, [r1] movu m1, [r1 + r2] movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] lea r1, [r1 + 4 * r2] movu m4, [r1] movu m5, [r1 + r2] movu m6, [r1 + 2 * r2] movu m7, [r1 + r3] punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2] punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2] punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4] punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4] punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6] punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6] punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8] punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8] punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4] punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4] punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8] punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8] punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4] punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4] punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8] punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8] punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8] 
punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8] punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8] punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8] punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8] punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8] punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8] punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8] movu [r0 + 0 * 32], xm6 vextracti128 [r0 + 8 * 32], m6, 1 movu [r0 + 1 * 32], xm7 vextracti128 [r0 + 9 * 32], m7, 1 movu [r0 + 2 * 32], xm1 vextracti128 [r0 + 10 * 32], m1, 1 movu [r0 + 3 * 32], xm8 vextracti128 [r0 + 11 * 32], m8, 1 movu [r0 + 4 * 32], xm3 vextracti128 [r0 + 12 * 32], m3, 1 movu [r0 + 5 * 32], xm5 vextracti128 [r0 + 13 * 32], m5, 1 movu [r0 + 6 * 32], xm2 vextracti128 [r0 + 14 * 32], m2, 1 movu [r0 + 7 * 32], xm0 vextracti128 [r0 + 15 * 32], m0, 1 ret cglobal transpose16, 3, 4, 9 add r2, r2 lea r3, [r2 * 3] call transpose16x8_internal lea r1, [r1 + 4 * r2] add r0, 16 call transpose16x8_internal RET %endif INIT_XMM sse2 cglobal transpose16, 3, 7, 4, dest, src, stride add r2, r2 mov r3, r0 mov r4, r1 mov r5, 32 mov r6, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16] mov r3, r0 call transpose8_internal lea r1, [r4 + 16] lea r0, [r6 + 8 * r5] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * r5 + 16] mov r3, r0 call transpose8_internal RET %else ;HIGH_BIT_DEPTH == 0 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose16, 3, 5, 9 lea r3, [r2 * 3] lea r4, [r1 + 8 * r2] movu xm0, [r1] movu xm1, [r1 + r2] movu xm2, [r1 + 2 * r2] movu xm3, [r1 + r3] vinserti128 m0, m0, [r4], 1 vinserti128 m1, m1, [r4 + r2], 1 vinserti128 m2, m2, [r4 + 2 * r2], 1 vinserti128 m3, m3, [r4 + r3], 1 lea r1, [r1 + 4 * r2] lea r4, [r4 + 4 * r2] movu xm4, [r1] movu xm5, [r1 + r2] movu xm6, [r1 + 2 * r2] movu xm7, [r1 + r3] vinserti128 m4, m4, [r4], 1 vinserti128 m5, m5, [r4 + r2], 1 vinserti128 m6, m6, [r4 + 2 * r2], 1 vinserti128 m7, m7, [r4 + r3], 1 punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10] punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10] punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12] punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12] punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14] punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14] punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16] punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16] punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12] punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12] punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16] punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16] punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12] punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12] punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16] punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16] punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] vpermq m6, m6, 0xD8 vpermq m7, m7, 0xD8 vpermq m1, m1, 0xD8 vpermq m8, m8, 0xD8 vpermq m3, m3, 0xD8 vpermq m5, m5, 0xD8 vpermq m2, m2, 0xD8 vpermq m0, m0, 
0xD8 movu [r0 + 0 * 16], m6 movu [r0 + 2 * 16], m7 movu [r0 + 4 * 16], m1 movu [r0 + 6 * 16], m8 movu [r0 + 8 * 16], m3 movu [r0 + 10 * 16], m5 movu [r0 + 12 * 16], m2 movu [r0 + 14 * 16], m0 RET %endif INIT_XMM sse2 cglobal transpose16, 3, 5, 8, dest, src, stride mov r3, r0 mov r4, r1 TRANSPOSE_8x8 16 lea r1, [r1 + 2 * r2] lea r0, [r3 + 8] TRANSPOSE_8x8 16 lea r1, [r4 + 8] lea r0, [r3 + 8 * 16] TRANSPOSE_8x8 16 lea r1, [r1 + 2 * r2] lea r0, [r3 + 8 * 16 + 8] TRANSPOSE_8x8 16 RET %endif cglobal transpose16_internal TRANSPOSE_8x8 r6 lea r1, [r1 + 2 * r2] lea r0, [r5 + 8] TRANSPOSE_8x8 r6 lea r1, [r1 + 2 * r2] neg r2 lea r1, [r1 + r2 * 8] lea r1, [r1 + r2 * 8 + 8] neg r2 lea r0, [r5 + 8 * r6] TRANSPOSE_8x8 r6 lea r1, [r1 + 2 * r2] lea r0, [r5 + 8 * r6 + 8] TRANSPOSE_8x8 r6 ret ;----------------------------------------------------------------- ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH == 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose8x32_internal movu m0, [r1] movu m1, [r1 + 32] movu m2, [r1 + r2] movu m3, [r1 + r2 + 32] movu m4, [r1 + 2 * r2] movu m5, [r1 + 2 * r2 + 32] movu m6, [r1 + r3] movu m7, [r1 + r3 + 32] lea r1, [r1 + 4 * r2] punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] movq [r0 + 0 * 64], xm7 movhps [r0 + 1 * 64], xm7 vextracti128 xm5, m7, 1 movq [r0 + 8 * 64], xm5 movhps [r0 + 9 * 64], xm5 movu m7, [r1] movu m9, [r1 + 32] movu m10, [r1 + r2] movu m11, [r1 + r2 + 32] movu m12, [r1 + 2 * r2] movu m13, [r1 + 2 * r2 + 32] movu m14, [r1 + r3] movu m15, [r1 + r3 + 32] punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] movq [r0 + 0 * 64 + 8], xm15 movhps [r0 + 1 * 64 + 8], xm15 vextracti128 xm13, m15, 1 movq [r0 + 8 * 64 + 8], xm13 movhps [r0 + 9 * 64 + 8], xm13 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 
7 8] punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] movu [r0 + 2 * 64], xm13 vextracti128 [r0 + 10 * 64], m13, 1 movu [r0 + 3 * 64], xm8 vextracti128 [r0 + 11 * 64], m8, 1 movu [r0 + 4 * 64], xm5 vextracti128 [r0 + 12 * 64], m5, 1 movu [r0 + 5 * 64], xm2 vextracti128 [r0 + 13 * 64], m2, 1 movu [r0 + 6 * 64], xm10 vextracti128 [r0 + 14 * 64], m10, 1 movu [r0 + 7 * 64], xm0 vextracti128 [r0 + 15 * 64], m0, 1 movu [r0 + 16 * 64], xm7 vextracti128 [r0 + 24 * 64], m7, 1 movu [r0 + 17 * 64], xm4 vextracti128 [r0 + 25 * 64], m4, 1 movu [r0 + 18 * 64], xm12 vextracti128 [r0 + 26 * 64], m12, 1 movu [r0 + 19 * 64], xm6 vextracti128 [r0 + 27 * 64], m6, 1 movu [r0 + 20 * 64], xm14 vextracti128 [r0 + 28 * 64], m14, 1 movu [r0 + 21 * 64], xm3 vextracti128 [r0 + 29 * 64], m3, 1 movu [r0 + 22 * 64], xm11 vextracti128 [r0 + 30 * 64], m11, 1 movu [r0 + 23 * 64], xm1 vextracti128 [r0 + 31 * 64], m1, 1 ret cglobal transpose32, 3, 4, 16 add r2, r2 lea r3, [r2 * 3] call transpose8x32_internal add r0, 16 lea r1, [r1 + 4 * r2] call transpose8x32_internal add r0, 16 lea r1, [r1 + 4 * r2] call transpose8x32_internal add r0, 16 lea r1, [r1 + 4 * r2] call transpose8x32_internal RET %endif INIT_XMM sse2 cglobal transpose32, 3, 7, 4, dest, src, stride add r2, r2 mov r3, r0 mov r4, r1 mov r5, 64 mov r6, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48] mov r3, r0 call transpose8_internal lea r1, [r4 + 16] lea r0, [r6 + 8 * 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 64 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 64 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 64 + 48] mov r3, r0 call transpose8_internal lea r1, [r4 + 32] lea r0, [r6 + 16 * 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 64 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 64 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 64 + 48] mov r3, r0 call transpose8_internal lea r1, [r4 + 48] lea r0, [r6 + 24 * 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 64 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 64 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 64 + 48] mov r3, r0 call transpose8_internal RET %else ;HIGH_BIT_DEPTH == 0 INIT_XMM sse2 cglobal transpose32, 3, 7, 8, dest, src, stride mov r3, r0 mov r4, r1 mov r5, r0 mov r6, 32 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16] mov r5, r0 call transpose16_internal lea r1, [r4 + 16] lea r0, [r3 + 16 * 32] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16 * 32 + 16] mov r5, r0 call transpose16_internal RET %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose32, 3, 5, 16 lea r3, [r2 * 3] mov r4d, 2 .loop: movu m0, [r1] movu m1, [r1 + r2] movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] lea r1, [r1 + 4 * r2] movu m4, [r1] movu m5, [r1 + r2] movu m6, [r1 + 2 * r2] movu m7, [r1 + r3] punpcklbw m8, m0, 
m1 ;[1 - 8 ; 17 - 24][1 2] punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4] punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8] punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] punpckhwd m0, m2 ;[13- 15; 29 - 32][1 2 3 4] punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] punpckhwd m4, m6 ;[13- 15; 29 - 32][5 6 7 8] punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] movq [r0 + 0 * 32], xm6 movhps [r0 + 1 * 32], xm6 vextracti128 xm4, m6, 1 movq [r0 + 16 * 32], xm4 movhps [r0 + 17 * 32], xm4 lea r1, [r1 + 4 * r2] movu m9, [r1] movu m10, [r1 + r2] movu m11, [r1 + 2 * r2] movu m12, [r1 + r3] lea r1, [r1 + 4 * r2] movu m13, [r1] movu m14, [r1 + r2] movu m15, [r1 + 2 * r2] movu m6, [r1 + r3] punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12] punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16] punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] 
punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] movq [r0 + 0 * 32 + 8], xm15 movhps [r0 + 1 * 32 + 8], xm15 vextracti128 xm9, m15, 1 movq [r0 + 16 * 32 + 8], xm9 movhps [r0 + 17 * 32 + 8], xm9 movu [r0 + 2 * 32], xm13 vextracti128 [r0 + 18 * 32], m13, 1 movu [r0 + 3 * 32], xm7 vextracti128 [r0 + 19 * 32], m7, 1 movu [r0 + 4 * 32], xm6 vextracti128 [r0 + 20 * 32], m6, 1 movu [r0 + 5 * 32], xm1 vextracti128 [r0 + 21 * 32], m1, 1 movu [r0 + 6 * 32], xm10 vextracti128 [r0 + 22 * 32], m10, 1 movu [r0 + 7 * 32], xm8 vextracti128 [r0 + 23 * 32], m8, 1 movu [r0 + 8 * 32], xm4 vextracti128 [r0 + 24 * 32], m4, 1 movu [r0 + 9 * 32], xm3 vextracti128 [r0 + 25 * 32], m3, 1 movu [r0 + 10 * 32], xm12 vextracti128 [r0 + 26 * 32], m12, 1 movu [r0 + 11 * 32], xm5 vextracti128 [r0 + 27 * 32], m5, 1 movu [r0 + 12 * 32], xm14 vextracti128 [r0 + 28 * 32], m14, 1 movu [r0 + 13 * 32], xm2 vextracti128 [r0 + 29 * 32], m2, 1 movu [r0 + 14 * 32], xm11 vextracti128 [r0 + 30 * 32], m11, 1 movu [r0 + 15 * 32], xm0 vextracti128 [r0 + 31 * 32], m0, 1 add r0, 16 lea r1, [r1 + 4 * r2] dec r4d jnz .loop RET %endif %endif ;----------------------------------------------------------------- ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH == 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose8x32_64_internal movu m0, [r1] movu m1, [r1 + 32] movu m2, [r1 + r2] movu m3, [r1 + r2 + 32] movu m4, [r1 + 2 * r2] movu m5, [r1 + 2 * r2 + 32] movu m6, [r1 + r3] movu m7, [r1 + r3 + 32] lea r1, [r1 + 4 * r2] punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] movq [r0 + 0 * 128], xm7 movhps [r0 + 1 * 128], xm7 vextracti128 xm5, m7, 1 movq [r0 + 8 * 128], xm5 movhps [r0 + 9 * 128], xm5 movu m7, [r1] movu m9, [r1 + 32] movu m10, [r1 + r2] movu m11, [r1 + r2 + 32] movu m12, [r1 + 2 * r2] movu m13, [r1 + 2 * r2 + 32] movu m14, [r1 + r3] movu m15, [r1 + r3 + 32] punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] movq [r0 + 0 * 128 + 8], xm15 movhps 
[r0 + 1 * 128 + 8], xm15 vextracti128 xm13, m15, 1 movq [r0 + 8 * 128 + 8], xm13 movhps [r0 + 9 * 128 + 8], xm13 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] movu [r0 + 2 * 128], xm13 vextracti128 [r0 + 10 * 128], m13, 1 movu [r0 + 3 * 128], xm8 vextracti128 [r0 + 11 * 128], m8, 1 movu [r0 + 4 * 128], xm5 vextracti128 [r0 + 12 * 128], m5, 1 movu [r0 + 5 * 128], xm2 vextracti128 [r0 + 13 * 128], m2, 1 movu [r0 + 6 * 128], xm10 vextracti128 [r0 + 14 * 128], m10, 1 movu [r0 + 7 * 128], xm0 vextracti128 [r0 + 15 * 128], m0, 1 movu [r0 + 16 * 128], xm7 vextracti128 [r0 + 24 * 128], m7, 1 movu [r0 + 17 * 128], xm4 vextracti128 [r0 + 25 * 128], m4, 1 movu [r0 + 18 * 128], xm12 vextracti128 [r0 + 26 * 128], m12, 1 movu [r0 + 19 * 128], xm6 vextracti128 [r0 + 27 * 128], m6, 1 movu [r0 + 20 * 128], xm14 vextracti128 [r0 + 28 * 128], m14, 1 movu [r0 + 21 * 128], xm3 vextracti128 [r0 + 29 * 128], m3, 1 movu [r0 + 22 * 128], xm11 vextracti128 [r0 + 30 * 128], m11, 1 movu [r0 + 23 * 128], xm1 vextracti128 [r0 + 31 * 128], m1, 1 ret cglobal transpose64, 3, 6, 16 add r2, r2 lea r3, [3 * r2] lea r4, [r1 + 64] lea r5, [r0 + 16] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r5, [r0 + 16] lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal mov r0, r5 lea r4, [r1 + 4 * r2] lea r1, [r4 - 64] call transpose8x32_64_internal mov r1, r4 lea r0, [r0 + 32 * 128] call transpose8x32_64_internal RET %endif INIT_XMM sse2 cglobal transpose64, 3, 7, 4, dest, src, stride add r2, r2 mov r3, r0 mov r4, r1 mov r5, 128 mov r6, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 80] mov r3, r0 call 
transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 16] lea r0, [r6 + 8 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 8 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 32] lea r0, [r6 + 16 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 16 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 48] lea r0, [r6 + 24 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 24 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 64] lea r0, [r6 + 32 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 32 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 80] lea r0, [r6 + 40 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 80] mov r3, r0 call 
transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 40 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 96] lea r0, [r6 + 48 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 48 * 128 + 112] mov r3, r0 call transpose8_internal lea r1, [r4 + 112] lea r0, [r6 + 56 * 128] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 16] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 32] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 48] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 64] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 80] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 96] mov r3, r0 call transpose8_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r6 + 56 * 128 + 112] mov r3, r0 call transpose8_internal RET %else ;HIGH_BIT_DEPTH == 0 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal transpose16x32_avx2 movu m0, [r1] movu m1, [r1 + r2] movu m2, [r1 + 2 * r2] movu m3, [r1 + r3] lea r1, [r1 + 4 * r2] movu m4, [r1] movu m5, [r1 + r2] movu m6, [r1 + 2 * r2] movu m7, [r1 + r3] punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4] punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8] punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] punpckhwd m0, m2 ;[12- 15; 29 - 32][1 2 3 4] punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] punpckhwd m4, m6 ;[12- 15; 29 - 32][5 6 7 8] punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] movq [r0 + 0 * 64], xm6 movhps [r0 + 1 * 64], xm6 vextracti128 xm4, m6, 1 movq [r0 + 16 * 64], xm4 movhps [r0 + 17 * 64], xm4 lea r1, [r1 + 4 * r2] movu m9, [r1] movu m10, [r1 + r2] movu m11, [r1 + 2 * r2] movu m12, [r1 + r3] lea r1, [r1 + 4 * r2] movu m13, [r1] movu m14, [r1 + r2] movu m15, [r1 + 2 * r2] movu m6, [r1 + r3] punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] punpckhbw m11, m12 ;[9 - 
16; 25 - 32][11 12] punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12] punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16] punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] movq [r0 + 0 * 64 + 8], xm15 movhps [r0 + 1 * 64 + 8], xm15 vextracti128 xm9, m15, 1 movq [r0 + 16 * 64 + 8], xm9 movhps [r0 + 17 * 64 + 8], xm9 movu [r0 + 2 * 64], xm13 vextracti128 [r0 + 18 * 64], m13, 1 movu [r0 + 3 * 64], xm7 vextracti128 [r0 + 19 * 64], m7, 1 movu [r0 + 4 * 64], xm6 vextracti128 [r0 + 20 * 64], m6, 1 movu [r0 + 5 * 64], xm1 vextracti128 [r0 + 21 * 64], m1, 1 movu [r0 + 6 * 64], xm10 vextracti128 [r0 + 22 * 64], m10, 1 movu [r0 + 7 * 64], xm8 vextracti128 [r0 + 23 * 64], m8, 1 movu [r0 + 8 * 64], xm4 vextracti128 [r0 + 24 * 64], m4, 1 movu [r0 + 9 * 64], xm3 vextracti128 [r0 + 25 * 64], m3, 1 movu [r0 + 10 * 64], xm12 vextracti128 [r0 + 26 * 64], m12, 1 movu [r0 + 11 * 64], xm5 vextracti128 [r0 + 27 * 64], m5, 1 movu [r0 + 12 * 64], xm14 vextracti128 [r0 + 28 * 64], m14, 1 movu [r0 + 13 * 64], xm2 vextracti128 [r0 + 29 * 64], m2, 1 movu [r0 + 14 * 64], xm11 vextracti128 [r0 + 30 * 64], m11, 1 movu [r0 + 15 * 64], xm0 vextracti128 [r0 + 31 * 64], m0, 1 ret cglobal transpose64, 3, 6, 16 lea r3, [r2 * 3] lea r4, [r0 + 16] lea r5, [r1 + 32] call transpose16x32_avx2 lea r0, [r0 + 32 * 64] mov r1, r5 call transpose16x32_avx2 mov r0, r4 lea r5, [r1 + 4 * r2] lea r1, [r5 - 32] call transpose16x32_avx2 lea r0, [r0 + 32 * 64] mov r1, r5 call transpose16x32_avx2 lea r0, [r4 + 16] lea r5, [r1 + 4 * r2] lea r1, [r5 - 32] call transpose16x32_avx2 lea r0, [r0 + 32 * 64] mov r1, r5 call 
transpose16x32_avx2 lea r5, [r1 + 4 * r2] lea r0, [r4 + 32] lea r1, [r5 - 32] call transpose16x32_avx2 lea r0, [r0 + 32 * 64] mov r1, r5 call transpose16x32_avx2 RET %endif INIT_XMM sse2 cglobal transpose64, 3, 7, 8, dest, src, stride mov r3, r0 mov r4, r1 mov r5, r0 mov r6, 64 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 32] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 48] mov r5, r0 call transpose16_internal lea r1, [r4 + 16] lea r0, [r3 + 16 * 64] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16 * 64 + 16] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16 * 64 + 32] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 16 * 64 + 48] mov r5, r0 call transpose16_internal lea r1, [r4 + 32] lea r0, [r3 + 32 * 64] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 32 * 64 + 16] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 32 * 64 + 32] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 32 * 64 + 48] mov r5, r0 call transpose16_internal lea r1, [r4 + 48] lea r0, [r3 + 48 * 64] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 48 * 64 + 16] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 48 * 64 + 32] mov r5, r0 call transpose16_internal lea r1, [r1 - 8 + 2 * r2] lea r0, [r3 + 48 * 64 + 48] mov r5, r0 call transpose16_internal RET %endif %if 0 ; un-used functions (SSIM) ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- %macro SSIM_ITER 1 %if HIGH_BIT_DEPTH movdqu m5, [r0+(%1&1)*r1] movdqu m6, [r2+(%1&1)*r3] %else movq m5, [r0+(%1&1)*r1] movq m6, [r2+(%1&1)*r3] punpcklbw m5, m0 punpcklbw m6, m0 %endif %if %1==1 lea r0, [r0+r1*2] lea r2, [r2+r3*2] %endif %if %1==0 movdqa m1, m5 movdqa m2, m6 %else paddw m1, m5 paddw m2, m6 %endif pmaddwd m7, m5, m6 pmaddwd m5, m5 pmaddwd m6, m6 ACCUM paddd, 3, 5, %1 ACCUM paddd, 4, 7, %1 paddd m3, m6 %endmacro %macro SSIM 0 cglobal pixel_ssim_4x4x2_core, 4,4,8 FIX_STRIDES r1, r3 pxor m0, m0 SSIM_ITER 0 SSIM_ITER 1 SSIM_ITER 2 SSIM_ITER 3 ; PHADDW m1, m2 ; PHADDD m3, m4 movdqa m7, [pw_1] pshufd m5, m3, q2301 pmaddwd m1, m7 pmaddwd m2, m7 pshufd m6, m4, q2301 packssdw m1, m2 paddd m3, m5 pshufd m1, m1, q3120 paddd m4, m6 pmaddwd m1, m7 punpckhdq m5, m3, m4 punpckldq m3, m4 %if UNIX64 %define t0 r4 %else %define t0 rax mov t0, r4mp %endif movq [t0+ 0], m1 movq [t0+ 8], m3 movhps [t0+16], m1 movq [t0+24], m5 RET ;----------------------------------------------------------------------------- ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- cglobal pixel_ssim_end4, 2,3 mov r2d, r2m mova m0, [r0+ 0] mova m1, [r0+16] mova m2, [r0+32] mova m3, [r0+48] mova m4, [r0+64] paddd m0, [r1+ 0] paddd m1, [r1+16] paddd m2, [r1+32] paddd m3, [r1+48] paddd m4, [r1+64] paddd m0, m1 paddd m1, m2 paddd m2, m3 paddd m3, m4 TRANSPOSE4x4D 0, 1, 2, 3, 
4 ; s1=m0, s2=m1, ss=m2, s12=m3 %if BIT_DEPTH >= 10 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m4, m0, m1 ; s1*s2 mulps m0, m0 ; s1*s1 mulps m1, m1 ; s2*s2 mulps m2, [pf_64] ; ss*64 mulps m3, [pf_128] ; s12*128 addps m4, m4 ; s1*s2*2 addps m0, m1 ; s1*s1 + s2*s2 subps m2, m0 ; vars subps m3, m4 ; covar*2 movaps m1, [ssim_c1] addps m4, m1 ; s1*s2*2 + ssim_c1 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1 movaps m1, [ssim_c2] addps m2, m1 ; vars + ssim_c2 addps m3, m1 ; covar*2 + ssim_c2 %else pmaddwd m4, m1, m0 ; s1*s2 pslld m1, 16 por m0, m1 pmaddwd m0, m0 ; s1*s1 + s2*s2 pslld m4, 1 pslld m3, 7 pslld m2, 6 psubd m3, m4 ; covar*2 psubd m2, m0 ; vars mova m1, [ssim_c1] paddd m0, m1 paddd m4, m1 mova m1, [ssim_c2] paddd m3, m1 paddd m2, m1 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) %endif mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim cmp r2d, 4 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2 %ifdef PIC lea r3, [mask_ff + 16] %xdefine %%mask r3 %else %xdefine %%mask mask_ff + 16 %endif %if cpuflag(avx) andps m4, [%%mask + r2*4] %else movups m0, [%%mask + r2*4] andps m4, m0 %endif .skip: movhlps m0, m4 addps m0, m4 %if cpuflag(ssse3) movshdup m4, m0 %else pshuflw m4, m0, q0032 %endif addss m0, m4 %if ARCH_X86_64 == 0 movss r0m, m0 fld dword r0m %endif RET %endmacro ; SSIM INIT_XMM sse2 SSIM INIT_XMM avx SSIM %endif ; %if 0 ; REMOVED (SSIM) %macro SCALE1D_128to64_HBD 0 movu m0, [r1] palignr m1, m0, 2 movu m2, [r1 + 16] palignr m3, m2, 2 movu m4, [r1 + 32] palignr m5, m4, 2 movu m6, [r1 + 48] pavgw m0, m1 palignr m1, m6, 2 pavgw m2, m3 pavgw m4, m5 pavgw m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0], m0 punpcklqdq m4, m6 movu [r0 + 16], m4 movu m0, [r1 + 64] palignr m1, m0, 2 movu m2, [r1 + 80] palignr m3, m2, 2 movu m4, [r1 + 96] palignr m5, m4, 2 movu m6, [r1 + 112] pavgw m0, m1 palignr m1, m6, 2 pavgw m2, m3 pavgw m4, m5 pavgw m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 32], m0 punpcklqdq m4, m6 movu [r0 + 48], m4 movu m0, [r1 + 128] palignr m1, m0, 2 movu m2, [r1 + 144] palignr m3, m2, 2 movu m4, [r1 + 160] palignr m5, m4, 2 movu m6, [r1 + 176] pavgw m0, m1 palignr m1, m6, 2 pavgw m2, m3 pavgw m4, m5 pavgw m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 64], m0 punpcklqdq m4, m6 movu [r0 + 80], m4 movu m0, [r1 + 192] palignr m1, m0, 2 movu m2, [r1 + 208] palignr m3, m2, 2 movu m4, [r1 + 224] palignr m5, m4, 2 movu m6, [r1 + 240] pavgw m0, m1 palignr m1, m6, 2 pavgw m2, m3 pavgw m4, m5 pavgw m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 96], m0 punpcklqdq m4, m6 movu [r0 + 112], m4 %endmacro ;----------------------------------------------------------------- ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) ;----------------------------------------------------------------- INIT_XMM ssse3 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride %if HIGH_BIT_DEPTH mova m7, [deinterleave_word_shuf] ;Top pixel SCALE1D_128to64_HBD ;Left pixel add r1, 256 add r0, 128 SCALE1D_128to64_HBD %else mova m7, [deinterleave_shuf] ;Top pixel movu m0, [r1] palignr m1, m0, 1 movu m2, [r1 + 16] palignr m3, m2, 1 movu m4, [r1 + 32] palignr 
m5, m4, 1 movu m6, [r1 + 48] pavgb m0, m1 palignr m1, m6, 1 pavgb m2, m3 pavgb m4, m5 pavgb m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0], m0 punpcklqdq m4, m6 movu [r0 + 16], m4 movu m0, [r1 + 64] palignr m1, m0, 1 movu m2, [r1 + 80] palignr m3, m2, 1 movu m4, [r1 + 96] palignr m5, m4, 1 movu m6, [r1 + 112] pavgb m0, m1 palignr m1, m6, 1 pavgb m2, m3 pavgb m4, m5 pavgb m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 32], m0 punpcklqdq m4, m6 movu [r0 + 48], m4 ;Left pixel movu m0, [r1 + 128] palignr m1, m0, 1 movu m2, [r1 + 144] palignr m3, m2, 1 movu m4, [r1 + 160] palignr m5, m4, 1 movu m6, [r1 + 176] pavgb m0, m1 palignr m1, m6, 1 pavgb m2, m3 pavgb m4, m5 pavgb m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 64], m0 punpcklqdq m4, m6 movu [r0 + 80], m4 movu m0, [r1 + 192] palignr m1, m0, 1 movu m2, [r1 + 208] palignr m3, m2, 1 movu m4, [r1 + 224] palignr m5, m4, 1 movu m6, [r1 + 240] pavgb m0, m1 palignr m1, m6, 1 pavgb m2, m3 pavgb m4, m5 pavgb m6, m1 pshufb m0, m0, m7 pshufb m2, m2, m7 pshufb m4, m4, m7 pshufb m6, m6, m7 punpcklqdq m0, m2 movu [r0 + 96], m0 punpcklqdq m4, m6 movu [r0 + 112], m4 %endif RET %if HIGH_BIT_DEPTH == 1 INIT_YMM avx2 cglobal scale1D_128to64, 2, 2, 3 pxor m2, m2 ;Top pixel movu m0, [r1] movu m1, [r1 + 32] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0], m0 movu m0, [r1 + 64] movu m1, [r1 + 96] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 32], m0 movu m0, [r1 + 128] movu m1, [r1 + 160] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 64], m0 movu m0, [r1 + 192] movu m1, [r1 + 224] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 96], m0 ;Left pixel movu m0, [r1 + 256] movu m1, [r1 + 288] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 128], m0 movu m0, [r1 + 320] movu m1, [r1 + 352] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 160], m0 movu m0, [r1 + 384] movu m1, [r1 + 416] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 192], m0 movu m0, [r1 + 448] movu m1, [r1 + 480] phaddw m0, m1 pavgw m0, m2 vpermq m0, m0, 0xD8 movu [r0 + 224], m0 RET %else ; HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal scale1D_128to64, 2, 2, 4 pxor m2, m2 mova m3, [pb_1] ;Top pixel movu m0, [r1] pmaddubsw m0, m0, m3 pavgw m0, m2 movu m1, [r1 + 32] pmaddubsw m1, m1, m3 pavgw m1, m2 packuswb m0, m1 vpermq m0, m0, 0xD8 movu [r0], m0 movu m0, [r1 + 64] pmaddubsw m0, m0, m3 pavgw m0, m2 movu m1, [r1 + 96] pmaddubsw m1, m1, m3 pavgw m1, m2 packuswb m0, m1 vpermq m0, m0, 0xD8 movu [r0 + 32], m0 ;Left pixel movu m0, [r1 + 128] pmaddubsw m0, m0, m3 pavgw m0, m2 movu m1, [r1 + 160] pmaddubsw m1, m1, m3 pavgw m1, m2 packuswb m0, m1 vpermq m0, m0, 0xD8 movu [r0 + 64], m0 movu m0, [r1 + 192] pmaddubsw m0, m0, m3 pavgw m0, m2 movu m1, [r1 + 224] pmaddubsw m1, m1, m3 pavgw m1, m2 packuswb m0, m1 vpermq m0, m0, 0xD8 movu [r0 + 96], m0 RET %endif ;----------------------------------------------------------------- ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM ssse3 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride mov r3d, 32 mova m7, [deinterleave_word_shuf] add r2, r2 .loop: movu m0, [r1] ;i psrld m1, m0, 16 ;j movu m2, [r1 + r2] ;k psrld m3, m2, 16 ;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgw m0, m1 ;s 
pavgw m2, m3 ;t movu m5, m0 pavgw m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmulw_16p] psubw m0, m4 ;Result movu m1, [r1 + 16] ;i psrld m2, m1, 16 ;j movu m3, [r1 + r2 + 16] ;k psrld m4, m3, 16 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgw m1, m2 ;s pavgw m3, m4 ;t movu m6, m1 pavgw m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmulw_16p] psubw m1, m5 ;Result pshufb m0, m7 pshufb m1, m7 punpcklqdq m0, m1 movu [r0], m0 movu m0, [r1 + 32] ;i psrld m1, m0, 16 ;j movu m2, [r1 + r2 + 32] ;k psrld m3, m2, 16 ;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgw m0, m1 ;s pavgw m2, m3 ;t movu m5, m0 pavgw m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmulw_16p] psubw m0, m4 ;Result movu m1, [r1 + 48] ;i psrld m2, m1, 16 ;j movu m3, [r1 + r2 + 48] ;k psrld m4, m3, 16 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgw m1, m2 ;s pavgw m3, m4 ;t movu m6, m1 pavgw m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmulw_16p] psubw m1, m5 ;Result pshufb m0, m7 pshufb m1, m7 punpcklqdq m0, m1 movu [r0 + 16], m0 movu m0, [r1 + 64] ;i psrld m1, m0, 16 ;j movu m2, [r1 + r2 + 64] ;k psrld m3, m2, 16 ;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgw m0, m1 ;s pavgw m2, m3 ;t movu m5, m0 pavgw m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmulw_16p] psubw m0, m4 ;Result movu m1, [r1 + 80] ;i psrld m2, m1, 16 ;j movu m3, [r1 + r2 + 80] ;k psrld m4, m3, 16 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgw m1, m2 ;s pavgw m3, m4 ;t movu m6, m1 pavgw m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmulw_16p] psubw m1, m5 ;Result pshufb m0, m7 pshufb m1, m7 punpcklqdq m0, m1 movu [r0 + 32], m0 movu m0, [r1 + 96] ;i psrld m1, m0, 16 ;j movu m2, [r1 + r2 + 96] ;k psrld m3, m2, 16 ;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgw m0, m1 ;s pavgw m2, m3 ;t movu m5, m0 pavgw m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmulw_16p] psubw m0, m4 ;Result movu m1, [r1 + 112] ;i psrld m2, m1, 16 ;j movu m3, [r1 + r2 + 112] ;k psrld m4, m3, 16 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgw m1, m2 ;s pavgw m3, m4 ;t movu m6, m1 pavgw m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmulw_16p] psubw m1, m5 ;Result pshufb m0, m7 pshufb m1, m7 punpcklqdq m0, m1 movu [r0 + 48], m0 lea r0, [r0 + 64] lea r1, [r1 + 2 * r2] dec r3d jnz .loop RET %else INIT_XMM ssse3 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride mov r3d, 32 mova m7, [deinterleave_shuf] .loop: movu m0, [r1] ;i psrlw m1, m0, 8 ;j movu m2, [r1 + r2] ;k psrlw m3, m2, 8 ;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgb m0, m1 ;s pavgb m2, m3 ;t movu m5, m0 pavgb m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmul_16p] psubb m0, m4 ;Result movu m1, [r1 + 16] ;i psrlw m2, m1, 8 ;j movu m3, [r1 + r2 + 16] ;k psrlw m4, m3, 8 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgb m1, m2 ;s pavgb m3, m4 ;t movu m6, m1 pavgb m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmul_16p] psubb m1, m5 ;Result pshufb m0, m0, m7 pshufb m1, m1, m7 punpcklqdq m0, m1 movu [r0], m0 movu m0, [r1 + 32] ;i psrlw m1, m0, 8 ;j movu m2, [r1 + r2 + 32] ;k psrlw m3, m2, 8 
;l movu m4, m0 movu m5, m2 pxor m4, m1 ;i^j pxor m5, m3 ;k^l por m4, m5 ;ij|kl pavgb m0, m1 ;s pavgb m2, m3 ;t movu m5, m0 pavgb m0, m2 ;(s+t+1)/2 pxor m5, m2 ;s^t pand m4, m5 ;(ij|kl)&st pand m4, [hmul_16p] psubb m0, m4 ;Result movu m1, [r1 + 48] ;i psrlw m2, m1, 8 ;j movu m3, [r1 + r2 + 48] ;k psrlw m4, m3, 8 ;l movu m5, m1 movu m6, m3 pxor m5, m2 ;i^j pxor m6, m4 ;k^l por m5, m6 ;ij|kl pavgb m1, m2 ;s pavgb m3, m4 ;t movu m6, m1 pavgb m1, m3 ;(s+t+1)/2 pxor m6, m3 ;s^t pand m5, m6 ;(ij|kl)&st pand m5, [hmul_16p] psubb m1, m5 ;Result pshufb m0, m0, m7 pshufb m1, m1, m7 punpcklqdq m0, m1 movu [r0 + 16], m0 lea r0, [r0 + 32] lea r1, [r1 + 2 * r2] dec r3d jnz .loop RET %endif ;----------------------------------------------------------------- ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) ;----------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_YMM avx2 cglobal scale2D_64to32, 3, 4, 5, dest, src, stride mov r3d, 32 add r2d, r2d mova m4, [pw_2000] .loop: movu m0, [r1] movu m1, [r1 + 1 * mmsize] movu m2, [r1 + r2] movu m3, [r1 + r2 + 1 * mmsize] paddw m0, m2 paddw m1, m3 phaddw m0, m1 pmulhrsw m0, m4 vpermq m0, m0, q3120 movu [r0], m0 movu m0, [r1 + 2 * mmsize] movu m1, [r1 + 3 * mmsize] movu m2, [r1 + r2 + 2 * mmsize] movu m3, [r1 + r2 + 3 * mmsize] paddw m0, m2 paddw m1, m3 phaddw m0, m1 pmulhrsw m0, m4 vpermq m0, m0, q3120 movu [r0 + mmsize], m0 add r0, 64 lea r1, [r1 + 2 * r2] dec r3d jnz .loop RET %else INIT_YMM avx2 cglobal scale2D_64to32, 3, 5, 8, dest, src, stride mov r3d, 16 mova m7, [deinterleave_shuf] .loop: movu m0, [r1] ; i lea r4, [r1 + r2 * 2] psrlw m1, m0, 8 ; j movu m2, [r1 + r2] ; k psrlw m3, m2, 8 ; l pxor m4, m0, m1 ; i^j pxor m5, m2, m3 ; k^l por m4, m5 ; ij|kl pavgb m0, m1 ; s pavgb m2, m3 ; t mova m5, m0 pavgb m0, m2 ; (s+t+1)/2 pxor m5, m2 ; s^t pand m4, m5 ; (ij|kl)&st pand m4, [pb_1] psubb m0, m4 ; Result movu m1, [r1 + 32] ; i psrlw m2, m1, 8 ; j movu m3, [r1 + r2 + 32] ; k psrlw m4, m3, 8 ; l pxor m5, m1, m2 ; i^j pxor m6, m3, m4 ; k^l por m5, m6 ; ij|kl pavgb m1, m2 ; s pavgb m3, m4 ; t mova m6, m1 pavgb m1, m3 ; (s+t+1)/2 pxor m6, m3 ; s^t pand m5, m6 ; (ij|kl)&st pand m5, [pb_1] psubb m1, m5 ; Result pshufb m0, m0, m7 pshufb m1, m1, m7 punpcklqdq m0, m1 vpermq m0, m0, 11011000b movu [r0], m0 add r0, 32 movu m0, [r4] ; i psrlw m1, m0, 8 ; j movu m2, [r4 + r2] ; k psrlw m3, m2, 8 ; l pxor m4, m0, m1 ; i^j pxor m5, m2, m3 ; k^l por m4, m5 ; ij|kl pavgb m0, m1 ; s pavgb m2, m3 ; t mova m5, m0 pavgb m0, m2 ; (s+t+1)/2 pxor m5, m2 ; s^t pand m4, m5 ; (ij|kl)&st pand m4, [pb_1] psubb m0, m4 ; Result movu m1, [r4 + 32] ; i psrlw m2, m1, 8 ; j movu m3, [r4 + r2 + 32] ; k psrlw m4, m3, 8 ; l pxor m5, m1, m2 ; i^j pxor m6, m3, m4 ; k^l por m5, m6 ; ij|kl pavgb m1, m2 ; s pavgb m3, m4 ; t mova m6, m1 pavgb m1, m3 ; (s+t+1)/2 pxor m6, m3 ; s^t pand m5, m6 ; (ij|kl)&st pand m5, [pb_1] psubb m1, m5 ; Result pshufb m0, m0, m7 pshufb m1, m1, m7 punpcklqdq m0, m1 vpermq m0, m0, 11011000b movu [r0], m0 lea r1, [r1 + 4 * r2] add r0, 32 dec r3d jnz .loop RET %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 add r4, r4 add r5, r5 add r1, r1 movh m0, [r2] movh m2, [r2 + r4] movh 
m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movh m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movh [r0], m0 movh [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movh [r0 + r1], m6 RET %else INIT_XMM sse4 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 add r1, r1 movd m0, [r2] movd m2, [r2 + r4] movd m1, [r3] movd m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movd m4, [r2] movd m6, [r2 + r4] movd m5, [r3] movd m7, [r3 + r5] punpckldq m0, m2 punpckldq m1, m3 punpckldq m4, m6 punpckldq m5, m7 pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m4, m4 pmovzxbw m5, m5 psubw m0, m1 psubw m4, m5 movh [r0], m0 movhps [r0 + r1], m0 movh [r0 + r1 * 2], m4 lea r0, [r0 + r1 * 2] movhps [r0 + r1], m4 RET %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W4_H4 2 %if HIGH_BIT_DEPTH cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movh m0, [r2] movh m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movh m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movh [r0], m0 movh [r0 + r1], m2 movh [r0 + r1 * 2], m4 lea r0, [r0 + r1 * 2] movh [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %else cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 add r1, r1 .loop: movd m0, [r2] movd m2, [r2 + r4] movd m1, [r3] movd m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movd m4, [r2] movd m6, [r2 + r4] movd m5, [r3] movd m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] punpckldq m0, m2 punpckldq m1, m3 punpckldq m4, m6 punpckldq m5, m7 pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m4, m4 pmovzxbw m5, m5 psubw m0, m1 psubw m4, m5 movh [r0], m0 movhps [r0 + r1], m0 movh [r0 + r1 * 2], m4 lea r0, [r0 + r1 * 2] movhps [r0 + r1], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PIXELSUB_PS_W4_H4 4, 8 PIXELSUB_PS_W4_H4 4, 16 %else INIT_XMM sse4 PIXELSUB_PS_W4_H4 4, 8 PIXELSUB_PS_W4_H4 4, 16 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W8_H4 2 %if HIGH_BIT_DEPTH cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movu m4, [r2] movu m6, [r2 + r4] movu m5, [r3] movu m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + r1], m2 movu [r0 + r1 * 2], m4 lea r0, [r0 + r1 * 2] movu [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %else cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 
add r1, r1 .loop: movh m0, [r2] movh m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movh m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m2, m2 pmovzxbw m3, m3 pmovzxbw m4, m4 pmovzxbw m5, m5 pmovzxbw m6, m6 pmovzxbw m7, m7 psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + r1], m2 movu [r0 + r1 * 2], m4 lea r0, [r0 + r1 * 2] movu [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PIXELSUB_PS_W8_H4 8, 8 PIXELSUB_PS_W8_H4 8, 16 PIXELSUB_PS_W8_H4 8, 32 %else INIT_XMM sse4 PIXELSUB_PS_W8_H4 8, 8 PIXELSUB_PS_W8_H4 8, 16 PIXELSUB_PS_W8_H4 8, 32 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W16_H4 2 %if HIGH_BIT_DEPTH cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] movu m4, [r2 + r4] movu m6, [r2 + r4 + 16] movu m5, [r3 + r5] movu m7, [r3 + r5 + 16] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + r1], m4 movu [r0 + r1 + 16], m6 movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] movu m4, [r2 + r4] movu m5, [r3 + r5] movu m6, [r2 + r4 + 16] movu m7, [r3 + r5 + 16] lea r0, [r0 + r1 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + r1], m4 movu [r0 + r1 + 16], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %else cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/4 pxor m6, m6 add r1, r1 .loop: movu m1, [r2] movu m3, [r3] pmovzxbw m0, m1 pmovzxbw m2, m3 punpckhbw m1, m6 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 movu m5, [r2 + r4] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, m5 pmovzxbw m2, m3 punpckhbw m5, m6 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m4 movu [r0 + r1 + 16], m5 movu m1, [r2] movu m3, [r3] pmovzxbw m0, m1 pmovzxbw m2, m3 punpckhbw m1, m6 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 movu m5, [r2 + r4] movu m3, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] lea r0, [r0 + r1 * 2] pmovzxbw m4, m5 pmovzxbw m2, m3 punpckhbw m5, m6 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m4 movu [r0 + r1 + 16], m5 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PIXELSUB_PS_W16_H4 16, 4 PIXELSUB_PS_W16_H4 16, 16 PIXELSUB_PS_W16_H4 16, 32 PIXELSUB_PS_W16_H4 16, 64 %else INIT_XMM sse4 PIXELSUB_PS_W16_H4 16, 4 PIXELSUB_PS_W16_H4 16, 16 PIXELSUB_PS_W16_H4 16, 32 PIXELSUB_PS_W16_H4 16, 64 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro PIXELSUB_PS_W16_H4_avx2 1 %if ARCH_X86_64 
INIT_YMM avx2 cglobal pixel_sub_ps_16x%1, 6, 9, 4, dest, deststride, src0, src1, srcstride0, srcstride1 add r1d, r1d add r4d, r4d add r5d, r5d lea r6, [r1 * 3] lea r7, [r4 * 3] lea r8, [r5 * 3] %rep %1/4 movu m0, [r2] movu m1, [r3] movu m2, [r2 + r4] movu m3, [r3 + r5] psubw m0, m1 psubw m2, m3 movu [r0], m0 movu [r0 + r1], m2 movu m0, [r2 + r4 * 2] movu m1, [r3 + r5 * 2] movu m2, [r2 + r7] movu m3, [r3 + r8] psubw m0, m1 psubw m2, m3 movu [r0 + r1 * 2], m0 movu [r0 + r6], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] %endrep RET %endif %endmacro PIXELSUB_PS_W16_H4_avx2 4 PIXELSUB_PS_W16_H4_avx2 8 PIXELSUB_PS_W16_H4_avx2 16 PIXELSUB_PS_W16_H4_avx2 32 PIXELSUB_PS_W16_H4_avx2 64 %else ;----------------------------------------------------------------------------- ; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W16_H8_avx2 2 %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_sub_ps_16x%2, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1 add r1, r1 lea r6, [r1 * 3] mov r7d, %2/8 lea r9, [r4 * 3] lea r8, [r5 * 3] .loop pmovzxbw m0, [r2] pmovzxbw m1, [r3] pmovzxbw m2, [r2 + r4] pmovzxbw m3, [r3 + r5] psubw m0, m1 psubw m2, m3 movu [r0], m0 movu [r0 + r1], m2 pmovzxbw m0, [r2 + 2 * r4] pmovzxbw m1, [r3 + 2 * r5] pmovzxbw m2, [r2 + r9] pmovzxbw m3, [r3 + r8] psubw m0, m1 psubw m2, m3 movu [r0 + r1 * 2], m0 movu [r0 + r6], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] pmovzxbw m0, [r2] pmovzxbw m1, [r3] pmovzxbw m2, [r2 + r4] pmovzxbw m3, [r3 + r5] psubw m0, m1 psubw m2, m3 movu [r0], m0 movu [r0 + r1], m2 pmovzxbw m0, [r2 + 2 * r4] pmovzxbw m1, [r3 + 2 * r5] pmovzxbw m2, [r2 + r9] pmovzxbw m3, [r3 + r8] psubw m0, m1 psubw m2, m3 movu [r0 + r1 * 2], m0 movu [r0 + r6], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] dec r7d jnz .loop RET %endif %endmacro PIXELSUB_PS_W16_H8_avx2 16, 16 PIXELSUB_PS_W16_H8_avx2 16, 32 PIXELSUB_PS_W16_H8_avx2 16, 64 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W32_H2 2 %if HIGH_BIT_DEPTH cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m4, [r2 + 32] movu m6, [r2 + 48] movu m1, [r3] movu m3, [r3 + 16] movu m5, [r3 + 32] movu m7, [r3 + 48] dec r6d psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 movu [r0 + 48], m6 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m4, [r2 + r4 + 32] movu m6, [r2 + r4 + 48] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] movu m5, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu [r0 + r1 + 32], m4 movu [r0 + r1 + 48], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %else cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/2 add r1, r1 .loop: movh m0, [r2] movh m1, [r2 + 8] movh m2, [r2 + 16] movh m6, [r2 + 24] movh m3, [r3] movh m4, [r3 + 8] movh m5, [r3 + 16] movh m7, [r3 + 24] dec 
r6d pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m2, m2 pmovzxbw m6, m6 pmovzxbw m3, m3 pmovzxbw m4, m4 pmovzxbw m5, m5 pmovzxbw m7, m7 psubw m0, m3 psubw m1, m4 psubw m2, m5 psubw m6, m7 movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m6 movh m0, [r2 + r4] movh m1, [r2 + r4 + 8] movh m2, [r2 + r4 + 16] movh m6, [r2 + r4 + 24] movh m3, [r3 + r5] movh m4, [r3 + r5 + 8] movh m5, [r3 + r5 + 16] movh m7, [r3 + r5 + 24] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m2, m2 pmovzxbw m6, m6 pmovzxbw m3, m3 pmovzxbw m4, m4 pmovzxbw m5, m5 pmovzxbw m7, m7 psubw m0, m3 psubw m1, m4 psubw m2, m5 psubw m6, m7 movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PIXELSUB_PS_W32_H2 32, 8 PIXELSUB_PS_W32_H2 32, 32 PIXELSUB_PS_W32_H2 32, 64 %else INIT_XMM sse4 PIXELSUB_PS_W32_H2 32, 8 PIXELSUB_PS_W32_H2 32, 32 PIXELSUB_PS_W32_H2 32, 64 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro PIXELSUB_PS_W32_H4_avx2 1 %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_sub_ps_32x%1, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1 add r1d, r1d add r4d, r4d add r5d, r5d mov r9d, %1/4 lea r6, [r1 * 3] lea r7, [r4 * 3] lea r8, [r5 * 3] .loop movu m0, [r2] movu m1, [r2 + 32] movu m2, [r3] movu m3, [r3 + 32] psubw m0, m2 psubw m1, m3 movu [r0], m0 movu [r0 + 32], m1 movu m0, [r2 + r4] movu m1, [r2 + r4 + 32] movu m2, [r3 + r5] movu m3, [r3 + r5 + 32] psubw m0, m2 psubw m1, m3 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu m0, [r2 + r4 * 2] movu m1, [r2 + r4 * 2 + 32] movu m2, [r3 + r5 * 2] movu m3, [r3 + r5 * 2 + 32] psubw m0, m2 psubw m1, m3 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m1 movu m0, [r2 + r7] movu m1, [r2 + r7 + 32] movu m2, [r3 + r8] movu m3, [r3 + r8 + 32] psubw m0, m2 psubw m1, m3 movu [r0 + r6], m0 movu [r0 + r6 + 32], m1 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] dec r9d jnz .loop RET %endif %endmacro PIXELSUB_PS_W32_H4_avx2 8 PIXELSUB_PS_W32_H4_avx2 32 PIXELSUB_PS_W32_H4_avx2 64 %else ;----------------------------------------------------------------------------- ; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W32_H8_avx2 2 %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_sub_ps_32x%2, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/8 add r1, r1 lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r3] pmovzxbw m3, [r3 + 16] psubw m0, m2 psubw m1, m3 movu [r0], m0 movu [r0 + 32], m1 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 16] pmovzxbw m2, [r3 + r5] pmovzxbw m3, [r3 + r5 + 16] psubw m0, m2 psubw m1, m3 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 pmovzxbw m0, [r2 + 2 * r4] pmovzxbw m1, [r2 + 2 * r4 + 16] pmovzxbw m2, [r3 + 2 * r5] pmovzxbw m3, [r3 + 2 * r5 + 16] psubw m0, m2 psubw m1, m3 movu [r0 + r1 * 2 ], m0 movu [r0 + r1 * 2 + 32], m1 pmovzxbw m0, [r2 + r7] pmovzxbw m1, [r2 + r7 + 16] pmovzxbw m2, [r3 + r8] pmovzxbw m3, [r3 + r8 + 16] psubw m0, m2 psubw m1, m3 
movu [r0 + r9], m0 movu [r0 + r9 +32], m1 lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] lea r0, [r0 + r1 * 4] pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r3] pmovzxbw m3, [r3 + 16] psubw m0, m2 psubw m1, m3 movu [r0 ], m0 movu [r0 + 32], m1 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 16] pmovzxbw m2, [r3 + r5] pmovzxbw m3, [r3 + r5 + 16] psubw m0, m2 psubw m1, m3 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 pmovzxbw m0, [r2 + 2 * r4] pmovzxbw m1, [r2 + 2 * r4 + 16] pmovzxbw m2, [r3 + 2 * r5] pmovzxbw m3, [r3 + 2 * r5 + 16] psubw m0, m2 psubw m1, m3 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m1 pmovzxbw m0, [r2 + r7] pmovzxbw m1, [r2 + r7 + 16] pmovzxbw m2, [r3 + r8] pmovzxbw m3, [r3 + r8 + 16] psubw m0, m2 psubw m1, m3 movu [r0 + r9], m0 movu [r0 + r9 + 32], m1 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] dec r6d jnz .loop RET %endif %endmacro PIXELSUB_PS_W32_H8_avx2 32, 8 PIXELSUB_PS_W32_H8_avx2 32, 16 PIXELSUB_PS_W32_H8_avx2 32, 32 PIXELSUB_PS_W32_H8_avx2 32, 64 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W64_H2 2 %if HIGH_BIT_DEPTH cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m4, [r2 + 32] movu m6, [r2 + 48] movu m1, [r3] movu m3, [r3 + 16] movu m5, [r3 + 32] movu m7, [r3 + 48] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 movu [r0 + 48], m6 movu m0, [r2 + 64] movu m2, [r2 + 80] movu m4, [r2 + 96] movu m6, [r2 + 112] movu m1, [r3 + 64] movu m3, [r3 + 80] movu m5, [r3 + 96] movu m7, [r3 + 112] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0 + 64], m0 movu [r0 + 80], m2 movu [r0 + 96], m4 movu [r0 + 112], m6 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m4, [r2 + r4 + 32] movu m6, [r2 + r4 + 48] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] movu m5, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu [r0 + r1 + 32], m4 movu [r0 + r1 + 48], m6 movu m0, [r2 + r4 + 64] movu m2, [r2 + r4 + 80] movu m4, [r2 + r4 + 96] movu m6, [r2 + r4 + 112] movu m1, [r3 + r5 + 64] movu m3, [r3 + r5 + 80] movu m5, [r3 + r5 + 96] movu m7, [r3 + r5 + 112] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] psubw m0, m1 psubw m2, m3 psubw m4, m5 psubw m6, m7 movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m2 movu [r0 + r1 + 96], m4 movu [r0 + r1 + 112], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %else cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %2/2 pxor m6, m6 add r1, r1 .loop: movu m1, [r2] movu m5, [r2 + 16] movu m3, [r3] movu m7, [r3 + 16] pmovzxbw m0, m1 pmovzxbw m4, m5 pmovzxbw m2, m3 punpckhbw m1, m6 punpckhbw m3, m6 punpckhbw m5, m6 psubw m0, m2 psubw m1, m3 pmovzxbw m2, m7 punpckhbw m7, m6 psubw m4, m2 psubw m5, m7 movu m3, [r2 + 32] movu m7, [r3 + 32] pmovzxbw m2, m3 punpckhbw m3, m6 movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m4 movu [r0 + 48], m5 movu m1, [r2 + 48] movu m5, [r3 + 48] pmovzxbw m0, m1 pmovzxbw m4, m7 punpckhbw m1, m6 punpckhbw m7, m6 psubw m2, m4 psubw m3, m7 movu [r0 + 64], m2 movu [r0 + 80], m3 movu m7, [r2 + r4] movu m3, [r3 + r5] pmovzxbw m2, 
m5 pmovzxbw m4, m7 punpckhbw m5, m6 punpckhbw m7, m6 psubw m0, m2 psubw m1, m5 movu [r0 + 96], m0 movu [r0 + 112], m1 movu m2, [r2 + r4 + 16] movu m5, [r3 + r5 + 16] pmovzxbw m0, m3 pmovzxbw m1, m2 punpckhbw m3, m6 punpckhbw m2, m6 psubw m4, m0 psubw m7, m3 movu [r0 + r1], m4 movu [r0 + r1 + 16], m7 movu m0, [r2 + r4 + 32] movu m3, [r3 + r5 + 32] dec r6d pmovzxbw m4, m5 pmovzxbw m7, m0 punpckhbw m5, m6 punpckhbw m0, m6 psubw m1, m4 psubw m2, m5 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 48], m2 movu m4, [r2 + r4 + 48] movu m5, [r3 + r5 + 48] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m1, m3 pmovzxbw m2, m4 punpckhbw m3, m6 punpckhbw m4, m6 psubw m7, m1 psubw m0, m3 movu [r0 + r1 + 64], m7 movu [r0 + r1 + 80], m0 pmovzxbw m7, m5 punpckhbw m5, m6 psubw m2, m7 psubw m4, m5 movu [r0 + r1 + 96], m2 movu [r0 + r1 + 112], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PIXELSUB_PS_W64_H2 64, 16 PIXELSUB_PS_W64_H2 64, 64 %else INIT_XMM sse4 PIXELSUB_PS_W64_H2 64, 16 PIXELSUB_PS_W64_H2 64, 64 %endif ;----------------------------------------------------------------------------- ; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W64_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_sub_ps_64x%1, 6, 10, 8, dest, deststride, src0, src1, srcstride0, srcstride1 add r1d, r1d add r4d, r4d add r5d, r5d mov r9d, %1/4 lea r6, [r1 * 3] lea r7, [r4 * 3] lea r8, [r5 * 3] .loop movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu m3, [r2 + 96] movu m4, [r3] movu m5, [r3 + 32] movu m6, [r3 + 64] movu m7, [r3 + 96] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 movu m0, [r2 + r4] movu m1, [r2 + r4 + 32] movu m2, [r2 + r4 + 64] movu m3, [r2 + r4 + 96] movu m4, [r3 + r5] movu m5, [r3 + r5 + 32] movu m6, [r3 + r5 + 64] movu m7, [r3 + r5 + 96] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu [r0 + r1 + 64], m2 movu [r0 + r1 + 96], m3 movu m0, [r2 + r4 * 2] movu m1, [r2 + r4 * 2 + 32] movu m2, [r2 + r4 * 2 + 64] movu m3, [r2 + r4 * 2 + 96] movu m4, [r3 + r5 * 2] movu m5, [r3 + r5 * 2 + 32] movu m6, [r3 + r5 * 2 + 64] movu m7, [r3 + r5 * 2 + 96] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m1 movu [r0 + r1 * 2 + 64], m2 movu [r0 + r1 * 2 + 96], m3 movu m0, [r2 + r7] movu m1, [r2 + r7 + 32] movu m2, [r2 + r7 + 64] movu m3, [r2 + r7 + 96] movu m4, [r3 + r8] movu m5, [r3 + r8 + 32] movu m6, [r3 + r8 + 64] movu m7, [r3 + r8 + 96] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0 + r6], m0 movu [r0 + r6 + 32], m1 movu [r0 + r6 + 64], m2 movu [r0 + r6 + 96], m3 lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] dec r9d jnz .loop RET %endif %else ;----------------------------------------------------------------------------- ; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal pixel_sub_ps_64x%1, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 mov r6d, %1/4 add r1, r1 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] pmovzxbw m4, [r3] pmovzxbw m5, 
[r3 + 16] pmovzxbw m6, [r3 + 32] pmovzxbw m7, [r3 + 48] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 add r0, r1 add r2, r4 add r3, r5 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] pmovzxbw m4, [r3] pmovzxbw m5, [r3 + 16] pmovzxbw m6, [r3 + 32] pmovzxbw m7, [r3 + 48] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 add r0, r1 add r2, r4 add r3, r5 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] pmovzxbw m4, [r3] pmovzxbw m5, [r3 + 16] pmovzxbw m6, [r3 + 32] pmovzxbw m7, [r3 + 48] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 add r0, r1 add r2, r4 add r3, r5 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 16] pmovzxbw m2, [r2 + 32] pmovzxbw m3, [r2 + 48] pmovzxbw m4, [r3] pmovzxbw m5, [r3 + 16] pmovzxbw m6, [r3 + 32] pmovzxbw m7, [r3 + 48] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 movu [r0], m0 movu [r0 + 32], m1 movu [r0 + 64], m2 movu [r0 + 96], m3 add r0, r1 add r2, r4 add r3, r5 dec r6d jnz .loop RET %endif %endmacro PIXELSUB_PS_W64_H4_avx2 16 PIXELSUB_PS_W64_H4_avx2 64 ;============================================================================= ; variance ;============================================================================= %macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] %elif mmsize < 32 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro %macro VAR_END 2 %if HIGH_BIT_DEPTH %if mmsize == 8 && %1*%2 == 256 HADDUW m5, m2 %else %if %1 >= 32 HADDW m5, m2 movd m7, r4d paddd m5, m7 %else HADDW m5, m2 %endif %endif %else ; !HIGH_BIT_DEPTH %if %1 == 64 HADDW m5, m2 movd m7, r4d paddd m5, m7 %else HADDW m5, m2 %endif %endif ; HIGH_BIT_DEPTH HADDD m6, m1 %if ARCH_X86_64 punpckldq m5, m6 movq rax, m5 %else movd eax, m5 movd edx, m6 %endif RET %endmacro %macro VAR_END_12bit 2 HADDD m5, m1 HADDD m6, m1 %if ARCH_X86_64 punpckldq m5, m6 movq rax, m5 %else movd eax, m5 movd edx, m6 %endif RET %endmacro %macro VAR_CORE 0 paddw m5, m0 paddw m5, m3 paddw m5, m1 paddw m5, m4 pmaddwd m0, m0 pmaddwd m3, m3 pmaddwd m1, m1 pmaddwd m4, m4 paddd m6, m0 paddd m6, m3 paddd m6, m1 paddd m6, m4 %endmacro %macro VAR_2ROW 2 mov r2d, %2 %%loop: %if HIGH_BIT_DEPTH movu m0, [r0] movu m1, [r0+mmsize] movu m3, [r0+%1] movu m4, [r0+%1+mmsize] %else ; !HIGH_BIT_DEPTH mova m0, [r0] punpckhbw m1, m0, m7 mova m3, [r0+%1] mova m4, m3 punpcklbw m0, m7 %endif ; HIGH_BIT_DEPTH %ifidn %1, r1 lea r0, [r0+%1*2] %else add r0, r1 %endif %if HIGH_BIT_DEPTH == 0 punpcklbw m3, m7 punpckhbw m4, m7 %endif ; !HIGH_BIT_DEPTH VAR_CORE dec r2d jg %%loop %endmacro ;----------------------------------------------------------------------------- ; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_var_16x16, 2,3 FIX_STRIDES r1 VAR_START 0 VAR_2ROW 8*SIZEOF_PIXEL, 16 VAR_END 16, 16 cglobal pixel_var_8x8, 2,3 FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 4 VAR_END 8, 8 %if HIGH_BIT_DEPTH %macro VAR 0 %if BIT_DEPTH <= 10 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 8 VAR_END 16, 16 cglobal pixel_var_32x32, 2,6,8 FIX_STRIDES r1 mov r3, r0 VAR_START 0 VAR_2ROW r1, 8 HADDW m5, m2 movd r4d, m5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 
pxor m5, m5 lea r0, [r3 + 32] VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 VAR_END 32, 32 cglobal pixel_var_64x64, 2,6,8 FIX_STRIDES r1 mov r3, r0 VAR_START 0 VAR_2ROW r1, 8 HADDW m5, m2 movd r4d, m5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 lea r0, [r3 + 32] VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 lea r0, [r3 + 64] VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 lea r0, [r3 + 96] VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 VAR_2ROW r1, 8 VAR_END 64, 64 %else ; BIT_DEPTH <= 10 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 4 HADDUWD m5, m1 mova m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m5, m7 VAR_END_12bit 16, 16 cglobal pixel_var_32x32, 2,6,8 FIX_STRIDES r1 mov r3, r0 VAR_START 0 VAR_2ROW r1, 4 HADDUWD m5, m1 mova m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 lea r0, [r3 + 32] pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m5, m7 VAR_END_12bit 32, 32 cglobal pixel_var_64x64, 2,6,8 FIX_STRIDES r1 mov r3, r0 VAR_START 0 VAR_2ROW r1, 4 HADDUWD m5, m1 mova m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 lea r0, [r3 + 16 * SIZEOF_PIXEL] pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 lea r0, [r3 + 32 * SIZEOF_PIXEL] pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 lea r0, [r3 + 48 * SIZEOF_PIXEL] pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, 
m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m7, m5 pxor m5, m5 VAR_2ROW r1, 4 HADDUWD m5, m1 paddd m5, m7 VAR_END_12bit 64, 64 %endif ; BIT_DEPTH <= 10 cglobal pixel_var_8x8, 2,3,8 lea r2, [r1*3] VAR_START 0 movu m0, [r0] movu m1, [r0+r1*2] movu m3, [r0+r1*4] movu m4, [r0+r2*2] lea r0, [r0+r1*8] VAR_CORE movu m0, [r0] movu m1, [r0+r1*2] movu m3, [r0+r1*4] movu m4, [r0+r2*2] VAR_CORE VAR_END 8, 8 %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR INIT_XMM xop VAR %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 %macro VAR 0 cglobal pixel_var_8x8, 2,3,8 VAR_START 1 lea r2, [r1 * 3] movh m0, [r0] movh m3, [r0 + r1] movhps m0, [r0 + r1 * 2] movhps m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 lea r0, [r0 + r1 * 4] VAR_CORE movh m0, [r0] movh m3, [r0 + r1] movhps m0, [r0 + r1 * 2] movhps m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 VAR_CORE VAR_END 8, 8 cglobal pixel_var_16x16_internal movu m0, [r0] movu m3, [r0 + r1] DEINTB 1, 0, 4, 3, 7 VAR_CORE movu m0, [r0 + 2 * r1] movu m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 lea r0, [r0 + r1 * 4] VAR_CORE movu m0, [r0] movu m3, [r0 + r1] DEINTB 1, 0, 4, 3, 7 VAR_CORE movu m0, [r0 + 2 * r1] movu m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 lea r0, [r0 + r1 * 4] VAR_CORE movu m0, [r0] movu m3, [r0 + r1] DEINTB 1, 0, 4, 3, 7 VAR_CORE movu m0, [r0 + 2 * r1] movu m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 lea r0, [r0 + r1 * 4] VAR_CORE movu m0, [r0] movu m3, [r0 + r1] DEINTB 1, 0, 4, 3, 7 VAR_CORE movu m0, [r0 + 2 * r1] movu m3, [r0 + r2] DEINTB 1, 0, 4, 3, 7 VAR_CORE ret cglobal pixel_var_16x16, 2,3,8 VAR_START 1 lea r2, [r1 * 3] call pixel_var_16x16_internal VAR_END 16, 16 cglobal pixel_var_32x32, 2,4,8 VAR_START 1 lea r2, [r1 * 3] mov r3, r0 call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r3 + 16] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal VAR_END 32, 32 cglobal pixel_var_64x64, 2,6,8 VAR_START 1 lea r2, [r1 * 3] mov r3, r0 call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal HADDW m5, m2 movd r4d, m5 pxor m5, m5 lea r0, [r3 + 16] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 lea r0, [r3 + 32] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r3 + 48] HADDW m5, m2 movd r5d, m5 add r4, r5 pxor m5, m5 call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal lea r0, [r0 + r1 * 4] call pixel_var_16x16_internal VAR_END 64, 64 %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR INIT_XMM xop VAR INIT_YMM avx2 cglobal pixel_var_16x16, 2,4,7 VAR_START 0 mov r2d, 4 lea r3, [r1*3] .loop: pmovzxbw m0, [r0] pmovzxbw m3, [r0+r1] pmovzxbw m1, [r0+r1*2] pmovzxbw m4, [r0+r3] lea r0, [r0+r1*4] VAR_CORE dec r2d jg .loop vextracti128 xm0, m5, 1 vextracti128 xm1, m6, 1 paddw xm5, xm0 paddd xm6, xm1 HADDW xm5, xm2 HADDD xm6, xm1 %if ARCH_X86_64 punpckldq xm5, xm6 movq rax, xm5 %else movd eax, xm5 
movd edx, xm6 %endif RET INIT_YMM avx2 cglobal pixel_var_32x32, 2,4,7 VAR_START 0 mov r2d, 16 .loop: pmovzxbw m0, [r0] pmovzxbw m3, [r0 + 16] pmovzxbw m1, [r0 + r1] pmovzxbw m4, [r0 + r1 + 16] lea r0, [r0 + r1 * 2] VAR_CORE dec r2d jg .loop vextracti128 xm0, m5, 1 vextracti128 xm1, m6, 1 paddw xm5, xm0 paddd xm6, xm1 HADDW xm5, xm2 HADDD xm6, xm1 %if ARCH_X86_64 punpckldq xm5, xm6 movq rax, xm5 %else movd eax, xm5 movd edx, xm6 %endif RET INIT_YMM avx2 cglobal pixel_var_64x64, 2,4,7 VAR_START 0 mov r2d, 64 .loop: pmovzxbw m0, [r0] pmovzxbw m3, [r0 + 16] pmovzxbw m1, [r0 + mmsize] pmovzxbw m4, [r0 + mmsize + 16] lea r0, [r0 + r1] VAR_CORE dec r2d jg .loop pxor m1, m1 punpcklwd m0, m5, m1 punpckhwd m5, m1 paddd m5, m0 vextracti128 xm2, m5, 1 vextracti128 xm1, m6, 1 paddd xm5, xm2 paddd xm6, xm1 HADDD xm5, xm2 HADDD xm6, xm1 %if ARCH_X86_64 punpckldq xm5, xm6 movq rax, xm5 %else movd eax, xm5 movd edx, xm6 %endif RET %endif ; !HIGH_BIT_DEPTH %macro VAR2_END 3 HADDW %2, xm1 movd r1d, %2 imul r1d, r1d HADDD %3, xm1 shr r1d, %1 movd eax, %3 movd [r4], %3 sub eax, r1d ; sqr - (sum * sum >> shift) RET %endmacro ;int scanPosLast(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize) ;{ ; int scanPosLast = 0; ; do ; { ; const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE; ; ; const uint32_t posLast = scan[scanPosLast++]; ; ; const int curCoeff = coeff[posLast]; ; const uint32_t isNZCoeff = (curCoeff != 0); ; numSig -= isNZCoeff; ; ; coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]); ; coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; ; coeffNum[cgIdx] += (uint8_t)isNZCoeff; ; } ; while (numSig > 0); ; return scanPosLast - 1; ;} %if ARCH_X86_64 == 1 INIT_XMM avx2,bmi2 cglobal scanPosLast, 7,11,6 ; convert unit of Stride(trSize) to int16_t mov r7d, r7m add r7d, r7d ; loading scan table and convert to Byte mova m0, [r6] packuswb m0, [r6 + mmsize] pxor m1, m0, [pb_15] ; clear CG count xor r9d, r9d ; m0 - Zigzag scan table ; m1 - revert order scan table ; m4 - zero ; m5 - ones pxor m4, m4 pcmpeqb m5, m5 lea r8d, [r7d * 3] .loop: ; position of current CG movzx r6d, word [r0] lea r6, [r6 * 2 + r1] add r0, 16 * 2 ; loading current CG movh m2, [r6] movhps m2, [r6 + r7] movh m3, [r6 + r7 * 2] movhps m3, [r6 + r8] packsswb m2, m3 ; Zigzag pshufb m3, m2, m0 pshufb m2, m1 ; get sign pmovmskb r6d, m3 pcmpeqb m3, m4 pmovmskb r10d, m3 not r10d pext r6d, r6d, r10d mov [r2 + r9 * 2], r6w ; get non-zero flag ; TODO: reuse above result with reorder pcmpeqb m2, m4 pxor m2, m5 pmovmskb r6d, m2 mov [r3 + r9 * 2], r6w ; get non-zero number, POPCNT is faster pabsb m2, m2 psadbw m2, m4 movhlps m3, m2 paddd m2, m3 movd r6d, m2 mov [r4 + r9], r6b inc r9d sub r5d, r6d jg .loop ; fixup last CG non-zero flag dec r9d movzx r0d, word [r3 + r9 * 2] ;%if cpuflag(bmi1) ; 2uops? ; tzcnt r1d, r0d ;%else bsf r1d, r0d ;%endif shrx r0d, r0d, r1d mov [r3 + r9 * 2], r0w ; get last pos mov eax, r9d shl eax, 4 xor r1d, 15 add eax, r1d RET ; t3 must be ecx, since it's used for shift. 
%if WIN64 DECLARE_REG_TMP 3,1,2,0 %elif ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 %else ; X86_32 %error Unsupport platform X86_32 %endif INIT_CPUFLAGS cglobal scanPosLast_x64, 5,12 mov r10, r3mp movifnidn t0, r0mp mov r5d, r5m xor r11d, r11d ; cgIdx xor r7d, r7d ; tmp for non-zero flag .loop: xor r8d, r8d ; coeffSign[] xor r9d, r9d ; coeffFlag[] xor t3d, t3d ; coeffNum[] %assign x 0 %rep 16 movzx r6d, word [t0 + x * 2] movsx r6d, word [t1 + r6 * 2] test r6d, r6d setnz r7b shr r6d, 31 shl r6d, t3b or r8d, r6d lea r9, [r9 * 2 + r7] add t3d, r7d %assign x x+1 %endrep ; store latest group data mov [t2 + r11 * 2], r8w mov [r10 + r11 * 2], r9w mov [r4 + r11], t3b inc r11d add t0, 16 * 2 sub r5d, t3d jnz .loop ; store group data bsf t3d, r9d shr r9d, t3b mov [r10 + (r11 - 1) * 2], r9w ; get posLast shl r11d, 4 sub r11d, t3d lea eax, [r11d - 1] RET %endif ;----------------------------------------------------------------------------- ; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum) ;----------------------------------------------------------------------------- INIT_XMM ssse3 cglobal findPosFirstLast, 3,3,4 ; convert stride to int16_t add r1d, r1d ; loading scan table and convert to Byte mova m0, [r2] packuswb m0, [r2 + mmsize] ; loading 16 of coeff movh m1, [r0] movhps m1, [r0 + r1] movh m2, [r0 + r1 * 2] lea r1d, [r1 * 3] movhps m2, [r0 + r1] pxor m3, m1, m2 packsswb m1, m2 ; get absSum movhlps m2, m3 pxor m3, m2 pshufd m2, m3, q2301 pxor m3, m2 movd r0d, m3 mov r2d, r0d shr r2d, 16 xor r2d, r0d shl r2d, 31 ; get non-zero mask pxor m2, m2 pcmpeqb m1, m2 ; reorder by Zigzag scan pshufb m1, m0 ; get First and Last pos pmovmskb r0d, m1 not r0d bsr r1w, r0w bsf eax, r0d ; side effect: clear AH to Zero shl r1d, 8 or eax, r2d ; merge absSumSign or eax, r1d ; merge lastNZPosInCG RET %if 0 ; REMOVED ; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int subPosBase) ;for (int i = 0; i < MLS_CG_SIZE; i++) ;{ ; tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]); ; tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]); ; tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]); ; tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]); ;} ;do ;{ ; uint32_t blkPos, sig, ctxSig; ; blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff]; ; const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? 
~0 : 0; ; sig = scanFlagMask & 1; ; scanFlagMask >>= 1; ; if (scanPosSigOff + (subSet == 0) + numNonZero) ; { ; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset; ; ctxSig = cnt & posZeroMask; ; ; const uint32_t mstate = baseCtx[ctxSig]; ; const uint32_t mps = mstate & 1; ; const uint32_t stateBits = xavs2_entropyStateBits[mstate ^ sig]; ; uint32_t nextState = (stateBits >> 24) + mps; ; if ((mstate ^ sig) == 1) ; nextState = sig; ; baseCtx[ctxSig] = (uint8_t)nextState; ; sum += stateBits; ; } ; absCoeff[numNonZero] = tmpCoeff[blkPos]; ; numNonZero += sig; ; scanPosSigOff--; ;} ;while(scanPosSigOff >= 0); ; sum &= 0xFFFFFF %if ARCH_X86_64 ; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase) INIT_XMM sse4 cglobal costCoeffNxN, 6,11,6 add r2d, r2d ; abs(coeff) movh m1, [r1] movhps m1, [r1 + r2] movh m2, [r1 + r2 * 2] lea r2, [r2 * 3] movhps m2, [r1 + r2] pabsw m1, m1 pabsw m2, m2 ; r[1-2] free here ; WARNING: beyond-bound read here! ; loading scan table mov r2d, r8m xor r2d, 15 movu m0, [r0 + r2 * 2] movu m3, [r0 + r2 * 2 + mmsize] packuswb m0, m3 pxor m0, [pb_15] xchg r2d, r8m ; r[0-1] free here ; reorder coeff mova m3, [deinterleave_shuf] pshufb m1, m3 pshufb m2, m3 punpcklqdq m3, m1, m2 punpckhqdq m1, m2 pshufb m3, m0 pshufb m1, m0 punpcklbw m2, m3, m1 punpckhbw m3, m1 ; r[0-1], m[1] free here ; loading tabSigCtx (+offset) mova m1, [r4] pshufb m1, m0 movd m4, r7m pxor m5, m5 pshufb m4, m5 paddb m1, m4 ; register mapping ; m0 - Zigzag ; m1 - sigCtx ; {m3,m2} - abs(coeff) ; r0 - xavs2_entropyStateBits ; r1 - baseCtx ; r2 - scanPosSigOff ; r3 - absCoeff ; r4 - nonZero ; r5 - scanFlagMask ; r6 - sum lea r0, [private_prefix %+ _entropyStateBits] mov r1, r6mp xor r6d, r6d xor r4d, r4d xor r8d, r8d test r2d, r2d jz .idx_zero .loop: ; { ; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset; ; ctxSig = cnt & posZeroMask; ; const uint32_t mstate = baseCtx[ctxSig]; ; const uint32_t mps = mstate & 1; ; const uint32_t stateBits = xavs2_entropyStateBits[mstate ^ sig]; ; uint32_t nextState = (stateBits >> 24) + mps; ; if ((mstate ^ sig) == 1) ; nextState = sig; ; baseCtx[ctxSig] = (uint8_t)nextState; ; sum += stateBits; ; } ; absCoeff[numNonZero] = tmpCoeff[blkPos]; ; numNonZero += sig; ; scanPosSigOff--; pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos] shr r5d, 1 setc r8b ; r8 = sig add r4d, r8d ; numNonZero += sig palignr m4, m3, m2, 2 psrldq m3, 2 mova m2, m4 movd r7d, m1 ; r7 = ctxSig movzx r7d, r7b psrldq m1, 1 movzx r9d, byte [r1 + r7] ; mstate = baseCtx[ctxSig] mov r10d, r9d and r10d, 1 ; mps = mstate & 1 xor r9d, r8d ; r9 = mstate ^ sig add r6d, [r0 + r9 * 4] ; sum += xavs2_entropyStateBits[mstate ^ sig] add r10b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r9b, 1 cmove r10d, r8d mov byte [r1 + r7], r10b dec r2d jg .loop .idx_zero: pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos] add r4b, r8m xor r2d, r2d cmp word r9m, 0 sete r2b add r4b, r2b jz .exit dec r2b movd r3d, m1 and r2d, r3d movzx r3d, byte [r1 + r2] ; mstate = baseCtx[ctxSig] mov r4d, r5d xor r5d, r3d ; r0 = mstate ^ sig and r3d, 1 ; mps = mstate & 1 add r6d, [r0 + r5 * 4] ; sum += xavs2_entropyStateBits[mstate ^ sig] add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r5b, 1 cmove r3d, r4d mov byte [r1 + r2], r3b .exit: %ifnidn eax,r6d mov eax, r6d %endif and eax, 0xFFFFFF RET ; uint32_t 
costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase) INIT_YMM avx2,bmi2 cglobal costCoeffNxN, 6,10,5 add r2d, r2d ; abs(coeff) movq xm1, [r1] movhps xm1, [r1 + r2] movq xm2, [r1 + r2 * 2] lea r2, [r2 * 3] movhps xm2, [r1 + r2] vinserti128 m1, m1, xm2, 1 pabsw m1, m1 ; r[1-2] free here ; loading tabSigCtx mova xm2, [r4] ; r[4] free here ; WARNING: beyond-bound read here! ; loading scan table mov r2d, r8m bzhi r4d, r5d, r2d ; clear non-scan mask bits mov r6d, r2d xor r2d, 15 movu m0, [r0 + r2 * 2] packuswb m0, m0 pxor m0, [pb_15] vpermq m0, m0, q3120 add r4d, r2d ; r4d = (scanPosSigOff == 15) -> (numNonZero == 0) mov r2d, r6d ; reorder tabSigCtx (+offset) pshufb xm2, xm0 vpbroadcastb xm3, r7m paddb xm2, xm3 ; r[0-1] free here ; reorder coeff pshufb m1, [deinterleave_shuf] vpermq m1, m1, q3120 pshufb m1, m0 vpermq m1, m1, q3120 pshufb m1, [interleave_shuf] ; r[0-1], m[2-3] free here ; sig mask pxor xm3, xm3 movd xm4, r5d vpbroadcastw m4, xm4 pandn m4, m4, [pw_exp2_0_15] pcmpeqw m4, m3 ; absCoeff[numNonZero] = tmpCoeff[blkPos] ; [0-3] movq r0, xm4 movq r1, xm1 pext r6, r1, r0 mov qword [r3], r6 popcnt r0, r0 shr r0, 3 add r3, r0 ; [4-7] pextrq r0, xm4, 1 pextrq r1, xm1, 1 pext r6, r1, r0 mov qword [r3], r6 popcnt r0, r0 shr r0, 3 add r3, r0 ; [8-B] vextracti128 xm4, m4, 1 movq r0, xm4 vextracti128 xm1, m1, 1 movq r1, xm1 pext r6, r1, r0 mov qword [r3], r6 popcnt r0, r0 shr r0, 3 add r3, r0 ; [C-F] pextrq r0, xm4, 1 pextrq r1, xm1, 1 pext r6, r1, r0 mov qword [r3], r6 ; r[0-1,3] free here ; register mapping ; m0 - Zigzag ; m1 - sigCtx ; r0 - xavs2_entropyStateBits ; r1 - baseCtx ; r2 - scanPosSigOff ; r5 - scanFlagMask ; r6 - sum ; {r3,r4} - ctxSig[15-0] ; r8m - (numNonZero != 0) || (subPosBase == 0) lea r0, [private_prefix %+ _entropyStateBits] mov r1, r6mp xor r6d, r6d xor r8d, r8d test r2d, r2d jz .idx_zero ; { ; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset; ; ctxSig = cnt & posZeroMask; ; const uint32_t mstate = baseCtx[ctxSig]; ; const uint32_t mps = mstate & 1; ; const uint32_t stateBits = xavs2_entropyStateBits[mstate ^ sig]; ; uint32_t nextState = (stateBits >> 24) + mps; ; if ((mstate ^ sig) == 1) ; nextState = sig; ; baseCtx[ctxSig] = (uint8_t)nextState; ; sum += stateBits; ; } ; absCoeff[numNonZero] = tmpCoeff[blkPos]; ; numNonZero += sig; ; scanPosSigOff--; .loop: shr r5d, 1 setc r8b ; r8 = sig movd r7d, xm2 ; r7 = ctxSig movzx r7d, r7b psrldq xm2, 1 movzx r9d, byte [r1 + r7] ; mstate = baseCtx[ctxSig] mov r3d, r9d and r3b, 1 ; mps = mstate & 1 xor r9d, r8d ; r9 = mstate ^ sig add r6d, [r0 + r9 * 4] ; sum += entropyStateBits[mstate ^ sig] add r3b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r9d, 1 cmove r3d, r8d mov byte [r1 + r7], r3b dec r2d jg .loop .idx_zero: xor r2d, r2d cmp word r9m, 0 sete r2b add r4d, r2d ; (numNonZero != 0) || (subPosBase == 0) jz .exit dec r2b movd r3d, xm2 and r2d, r3d movzx r3d, byte [r1 + r2] ; mstate = baseCtx[ctxSig] mov r4d, r5d xor r5d, r3d ; r0 = mstate ^ sig and r3b, 1 ; mps = mstate & 1 add r6d, [r0 + r5 * 4] ; sum += xavs2_entropyStateBits[mstate ^ sig] add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r5b, 1 cmove r3d, r4d mov byte [r1 + r2], r3b .exit: %ifnidn eax,r6d mov eax, r6d %endif and eax, 0xFFFFFF RET %endif ; ARCH_X86_64 ;uint32_t goRiceParam = 0; ;int firstCoeff2 = 1; ;uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel ;idx = 0; 
;do ;{ ; int baseLevel = (baseLevelN & 3) | firstCoeff2; ; baseLevelN >>= 2; ; int codeNumber = absCoeff[idx] - baseLevel; ; if (codeNumber >= 0) ; { ; uint32_t length = 0; ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION; ; if (codeNumber >= 0) ; { ; { ; unsigned long cidx; ; CLZ(cidx, codeNumber + 1); ; length = cidx; ; } ; codeNumber = (length + length); ; } ; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber); ; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam)) ; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); ; } ; if (absCoeff[idx] >= 2) ; firstCoeff2 = 0; ; idx++; ;} ;while(idx < numNonZero); ; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx) INIT_XMM sse4 cglobal costCoeffRemain, 0,7,1 ; assign RCX to R3 ; RAX always in R6 and free %if WIN64 DECLARE_REG_TMP 3,1,2,0 mov t0, r0 mov r4d, r2d %elif ARCH_X86_64 ; *nix x64 didn't do anything DECLARE_REG_TMP 0,1,2,3 mov r4d, r2d %else ; X86_32 DECLARE_REG_TMP 6,3,2,1 mov t0, r0m mov r4d, r2m %endif xor t3d, t3d xor r5d, r5d lea t0, [t0 + r4 * 2] mov r2d, 3 ; register mapping ; r2d - baseLevel & tmp ; r4d - idx ; t3 - goRiceParam ; eax - absCoeff[idx] & tmp ; r5 - sum .loop: mov eax, 1 cmp r4d, 8 cmovge r2d, eax movzx eax, word [t0] add t0, 2 sub eax, r2d ; codeNumber = absCoeff[idx] - baseLevel jl .next shr eax, t3b ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION lea r2d, [rax - 3 + 1] ; CLZ(cidx, codeNumber + 1); bsr r2d, r2d add r2d, r2d ; codeNumber = (length + length) sub eax, 3 cmovge eax, r2d lea eax, [3 + 1 + t3 + rax] ; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber) add r5d, eax ; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam)) ; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); cmp t3d, 4 setl al mov r2d, 3 shl r2d, t3b cmp word [t0 - 2], r2w setg r2b and al, r2b add t3b, al .next: inc r4d mov r2d, 2 cmp r4d, r1m jl .loop mov eax, r5d RET ; uint32_t costC1C2Flag(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset) ;idx = 0; ;do ;{ ; uint32_t symbol1 = absCoeff[idx] > 1; ; uint32_t symbol2 = absCoeff[idx] > 2; ; { ; const uint32_t mstate = baseCtxMod[c1]; ; baseCtxMod[c1] = sbacNext(mstate, symbol1); ; sum += sbacGetEntropyBits(mstate, symbol1); ; } ; if (symbol1) ; c1Next = 0; ; if (symbol1 + firstC2Flag == 3) ; firstC2Flag = symbol2; ; if (symbol1 + firstC2Idx == 9) ; firstC2Idx = idx; ; c1 = (c1Next & 3); ; c1Next >>= 2; ; idx++; ;} ;while(idx < numC1Flag); ;if (!c1) ;{ ; baseCtxMod = &m_contextState[(bIsLuma ? 
0 : NUM_ABS_FLAG_CTX_LUMA) + OFF_ABS_FLAG_CTX + ctxSet]; ; { ; const uint32_t mstate = baseCtxMod[0]; ; baseCtxMod[0] = sbacNext(mstate, firstC2Flag); ; sum += sbacGetEntropyBits(mstate, firstC2Flag); ; } ;} ;m_fracBits += (sum & 0xFFFFFF); ; TODO: we need more register, so I writen code as x64 only, but it is easy to portab to x86 platform %if ARCH_X86_64 INIT_XMM sse2 cglobal costC1C2Flag, 4,12,2 mova m0, [r0] packsswb m0, m0 pcmpgtb m1, m0, [pb_1] pcmpgtb m0, [pb_2] ; get mask for 'X>1' pmovmskb r0d, m1 mov r11d, r0d ; clear unavailable coeff flags xor r6d, r6d bts r6d, r1d dec r6d and r11d, r6d ; calculate firstC2Idx or r11d, 0x100 ; default value setting to 8 bsf r11d, r11d lea r5, [private_prefix %+ _entropyStateBits] xor r6d, r6d mov r4d, 0xFFFFFFF9 ; register mapping ; r4d - nextC1 ; r5 - xavs2_entropyStateBits ; r6d - sum ; r[7-10] - tmp ; r11d - firstC2Idx (not use in loop) ; process c1 flag .loop: ; const uint32_t mstate = baseCtx[ctxSig]; ; const uint32_t mps = mstate & 1; ; const uint32_t stateBits = xavs2_entropyStateBits[mstate ^ sig]; ; uint32_t nextState = (stateBits >> 24) + mps; ; if ((mstate ^ sig) == 1) ; nextState = sig; mov r10d, r4d ; c1 and r10d, 3 shr r4d, 2 xor r7d, r7d shr r0d, 1 cmovc r4d, r7d ; c1 <- 0 when C1Flag=1 setc r7b ; symbol1 movzx r8d, byte [r2 + r10] ; mstate = baseCtx[c1] mov r9d, r7d ; sig = symbol1 xor r7d, r8d ; mstate ^ sig and r8d, 1 ; mps = mstate & 1 add r6d, [r5 + r7 * 4] ; sum += xavs2_entropyStateBits[mstate ^ sig] add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig; cmove r8d, r9d mov byte [r2 + r10], r8b dec r1d jg .loop ; check and generate c1 flag shl r4d, 30 jnz .quit ; move to c2 ctx add r2, r3 ; process c2 flag pmovmskb r8d, m0 bt r8d, r11d setc r7b movzx r8d, byte [r2] ; mstate = baseCtx[c1] mov r1d, r7d ; sig = symbol1 xor r7d, r8d ; mstate ^ sig and r8d, 1 ; mps = mstate & 1 add r6d, [r5 + r7 * 4] ; sum += xavs2_entropyStateBits[mstate ^ sig] add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig; cmove r8d, r1d mov byte [r2], r8b .quit: shrd r4d, r11d, 4 %ifnidn r6d,eax mov eax, r6d %endif and eax, 0x00FFFFFF or eax, r4d RET %endif ; ARCH_X86_64 %endif ; %if 0 ; REMOVEDxavs2-1.3/source/common/x86/pixel.h000066400000000000000000000161611340660520300171330ustar00rootroot00000000000000/***************************************************************************** * pixel.h: x86 pixel metrics ***************************************************************************** * Copyright (C) 2003-2013 x264 project * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser ;* Min Chen * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at license @ x265.com. *****************************************************************************/ #ifndef XAVS2_X86_PIXEL_H #define XAVS2_X86_PIXEL_H /** * =========================================================================== * function declares * =========================================================================== */ #define FUNCDEF_TU(ret, name, cpu, ...) \ ret FPFX(name ## _4x4_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _8x8_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _16x16_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _32x32_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _64x64_ ## cpu(__VA_ARGS__)) #define FUNCDEF_TU_S(ret, name, cpu, ...) \ ret FPFX(name ## _4_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _8_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _16_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _32_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _64_ ## cpu(__VA_ARGS__)) #define FUNCDEF_PU(ret, name, cpu, ...) \ ret FPFX(name ## _4x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x48_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _48x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x64_ ## cpu)(__VA_ARGS__) #define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) 
\ FUNCDEF_PU(ret, name, cpu, __VA_ARGS__);\ ret FPFX(name ## _4x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x6_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _6x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _6x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x6_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x48_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _48x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x64_ ## cpu)(__VA_ARGS__); #define DECL_PIXELS(cpu) \ FUNCDEF_PU(int, pixel_ssd, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ FUNCDEF_PU(int, pixel_sa8d, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ FUNCDEF_PU(void, pixel_sad_x3, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*, intptr_t, int32_t*);\ FUNCDEF_PU(void, pixel_sad_x4, cpu, const pel_t*, const pel_t*, const pel_t*, const pel_t*, const pel_t*, intptr_t, int32_t*);\ FUNCDEF_PU(void, pixel_avg, cpu, pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int);\ FUNCDEF_PU(void, pixel_add_ps, cpu, pel_t* a, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1);\ FUNCDEF_PU(void, pixel_sub_ps, cpu, coeff_t* a, intptr_t dstride, const pel_t* b0, const pel_t* b1, intptr_t sstride0, intptr_t sstride1);\ FUNCDEF_PU(int, pixel_satd, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ FUNCDEF_PU(int, pixel_sad, cpu, const pel_t*, intptr_t, const pel_t*, intptr_t);\ FUNCDEF_PU(int, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t);\ FUNCDEF_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pel_t*, intptr_t, intptr_t, intptr_t);\ FUNCDEF_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t);\ FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t);\ FUNCDEF_TU(uint64_t, pixel_var, cpu, const pel_t*, intptr_t);\ FUNCDEF_TU(int, psyCost_pp, cpu, const pel_t* source, intptr_t sstride, const pel_t* recon, intptr_t rstride);\ FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) DECL_PIXELS(mmx); DECL_PIXELS(mmx2); DECL_PIXELS(sse2); DECL_PIXELS(sse3); DECL_PIXELS(sse4); DECL_PIXELS(ssse3); DECL_PIXELS(avx); DECL_PIXELS(xop); DECL_PIXELS(avx2); #undef DECL_PIXELS #endif // XAVS2_X86_PIXEL_H xavs2-1.3/source/common/x86/pixeladd8.asm000066400000000000000000001051271340660520300202260ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Praveen Kumar Tiwari ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* 
it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 SECTION .text cextern pw_pixel_max ;----------------------------------------------------------------------------- ; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 add r4, r4 add r5, r5 add r1, r1 movh m2, [r2] movhps m2, [r2 + r4] movh m3, [r3] movhps m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movhps m4, [r2 + r4] movh m5, [r3] movhps m5, [r3 + r5] paddw m2, m3 paddw m4, m5 CLIPW2 m2, m4, m0, m1 movh [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movhps [r0 + r1], m4 RET %else INIT_XMM sse4 cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1 add r5, r5 pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movd [r0], m0 movd [r0 + r1], m2 lea r0, [r0 + r1 * 2] movd [r0], m4 movd [r0 + r1], m6 RET %endif ;----------------------------------------------------------------------------- ; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W4_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movh m2, [r2] movhps m2, [r2 + r4] movh m3, [r3] movhps m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movhps m4, [r2 + r4] movh m5, [r3] movhps m5, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m2, m3 paddw m4, m5 CLIPW2 m2, m4, m0, m1 movh [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movhps [r0 + r1], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] dec r6d lea 
r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movd [r0], m0 movd [r0 + r1], m2 lea r0, [r0 + r1 * 2] movd [r0], m4 movd [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W4_H4 4, 8 PIXEL_ADD_PS_W4_H4 4, 16 ;----------------------------------------------------------------------------- ; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W8_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + r1], m2 movu m0, [r2] movu m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] dec r6d lea r0, [r0 + r1 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movu m5, [r3] movu m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movh [r0], m0 movh [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movh [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W8_H4 8, 8 PIXEL_ADD_PS_W8_H4 8, 16 PIXEL_ADD_PS_W8_H4 8, 32 ;----------------------------------------------------------------------------- ; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] lea r0, [r0 + r1 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m4, 
[r2 + r4] pmovzxbw m5, [r2 + r4 + 8] movu m2, [r3] movu m3, [r3 + 16] movu m6, [r3 + r5] movu m7, [r3 + r5 + 16] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m2 paddw m1, m3 paddw m4, m6 paddw m5, m7 packuswb m0, m1 packuswb m4, m5 movu [r0], m0 movu [r0 + r1], m4 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m4, [r2 + r4] pmovzxbw m5, [r2 + r4 + 8] movu m2, [r3] movu m3, [r3 + 16] movu m6, [r3 + r5] movu m7, [r3 + r5 + 16] dec r6d lea r0, [r0 + r1 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m2 paddw m1, m3 paddw m4, m6 paddw m5, m7 packuswb m0, m1 packuswb m4, m5 movu [r0], m0 movu [r0 + r1], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W16_H4 16, 4 PIXEL_ADD_PS_W16_H4 16, 8 PIXEL_ADD_PS_W16_H4 16, 12 PIXEL_ADD_PS_W16_H4 16, 16 PIXEL_ADD_PS_W16_H4 16, 32 PIXEL_ADD_PS_W16_H4 16, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_16x16(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1 mova m3, [pw_pixel_max] pxor m2, m2 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m1, [r3] paddw m0, m1 CLIPW m0, m2, m3 movu [r0], m0 movu m0, [r2 + r4] movu m1, [r3 + r5] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r1], m0 movu m0, [r2 + r4 * 2] movu m1, [r3 + r5 * 2] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r1 * 2], m0 movu m0, [r2 + r7] movu m1, [r3 + r8] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r9], m0 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else INIT_YMM avx2 cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 .loop: pmovzxbw m0, [r2] ; row 0 of src0 pmovzxbw m1, [r2 + r4] ; row 1 of src0 movu m2, [r3] ; row 0 of src1 movu m3, [r3 + r5] ; row 1 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m2, [r2] ; row 2 of src0 pmovzxbw m3, [r2 + r4] ; row 3 of src0 movu m4, [r3] ; row 2 of src1 movu m5, [r3 + r5] ; row 3 of src1 paddw m2, m4 paddw m3, m5 packuswb m2, m3 lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] vpermq m0, m0, 11011000b movu [r0], xm0 ; row 0 of dst vextracti128 xm3, m0, 1 movu [r0 + r1], xm3 ; row 1 of dst lea r0, [r0 + r1 * 2] vpermq m2, m2, 11011000b movu [r0], xm2 ; row 2 of dst vextracti128 xm3, m2, 1 movu [r0 + r1], xm3 ; row 3 of dst lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W16_H4_avx2 4 PIXEL_ADD_PS_W16_H4_avx2 8 PIXEL_ADD_PS_W16_H4_avx2 16 PIXEL_ADD_PS_W16_H4_avx2 32 PIXEL_ADD_PS_W16_H4_avx2 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, 
m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + 32] movu m2, [r2 + 48] movu m1, [r3 + 32] movu m3, [r3 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 32], m0 movu [r0 + 48], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2 + r4 + 32] movu m2, [r2 + r4 + 48] movu m1, [r3 + r5 + 32] movu m3, [r3 + r5 + 48] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m2, [r2 + 16] pmovzxbw m3, [r2 + 24] movu m4, [r3] movu m5, [r3 + 16] movu m6, [r3 + 32] movu m7, [r3 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0], m0 movu [r0 + 16], m2 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 8] pmovzxbw m2, [r2 + r4 + 16] pmovzxbw m3, [r2 + r4 + 24] movu m4, [r3 + r5] movu m5, [r3 + r5 + 16] movu m6, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W32_H2 32, 8 PIXEL_ADD_PS_W32_H2 32, 16 PIXEL_ADD_PS_W32_H2 32, 32 PIXEL_ADD_PS_W32_H2 32, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m2, [r2 + 32] movu m1, [r3] movu m3, [r3 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 32], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 32] movu m1, [r3 + r5] movu m3, [r3 + r5 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 32], m2 movu m0, [r2 + r4 * 2] movu m2, [r2 + r4 * 2 + 32] movu m1, [r3 + r5 * 2] movu m3, [r3 + r5 * 2 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m2 movu m0, [r2 + r7] movu m2, [r2 + r7 + 32] movu m1, [r3 + r8] movu m3, [r3 + r8 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r9], m0 movu [r0 + r9 + 32], m2 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: pmovzxbw m0, [r2] ; first half of row 0 of src0 pmovzxbw m1, [r2 + 16] ; second half of row 0 of src0 movu m2, [r3] ; first half of row 0 of src1 movu m3, [r3 + 32] ; second half of row 0 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0], m0 ; row 0 of dst pmovzxbw m0, [r2 + r4] ; first half of 
row 1 of src0 pmovzxbw m1, [r2 + r4 + 16] ; second half of row 1 of src0 movu m2, [r3 + r5] ; first half of row 1 of src1 movu m3, [r3 + r5 + 32] ; second half of row 1 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r1], m0 ; row 1 of dst pmovzxbw m0, [r2 + r4 * 2] ; first half of row 2 of src0 pmovzxbw m1, [r2 + r4 * 2 + 16] ; second half of row 2 of src0 movu m2, [r3 + r5 * 2] ; first half of row 2 of src1 movu m3, [r3 + + r5 * 2 + 32]; second half of row 2 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r1 * 2], m0 ; row 2 of dst pmovzxbw m0, [r2 + r7] ; first half of row 3 of src0 pmovzxbw m1, [r2 + r7 + 16] ; second half of row 3 of src0 movu m2, [r3 + r8] ; first half of row 3 of src1 movu m3, [r3 + r8 + 32] ; second half of row 3 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r9], m0 ; row 3 of dst lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] lea r0, [r0 + r1 * 4] dec r6d jnz .loop RET %endif %endif %endmacro PIXEL_ADD_PS_W32_H4_avx2 8 PIXEL_ADD_PS_W32_H4_avx2 16 PIXEL_ADD_PS_W32_H4_avx2 24 PIXEL_ADD_PS_W32_H4_avx2 32 PIXEL_ADD_PS_W32_H4_avx2 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + 32] movu m2, [r2 + 48] movu m1, [r3 + 32] movu m3, [r3 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 32], m0 movu [r0 + 48], m2 movu m0, [r2 + 64] movu m2, [r2 + 80] movu m1, [r3 + 64] movu m3, [r3 + 80] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 64], m0 movu [r0 + 80], m2 movu m0, [r2 + 96] movu m2, [r2 + 112] movu m1, [r3 + 96] movu m3, [r3 + 112] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 96], m0 movu [r0 + 112], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2 + r4 + 32] movu m2, [r2 + r4 + 48] movu m1, [r3 + r5 + 32] movu m3, [r3 + r5 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 movu m0, [r2 + r4 + 64] movu m2, [r2 + r4 + 80] movu m1, [r3 + r5 + 64] movu m3, [r3 + r5 + 80] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m2 movu m0, [r2 + r4 + 96] movu m2, [r2 + r4 + 112] movu m1, [r3 + r5 + 96] movu m3, [r3 + r5 + 112] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 96], m0 movu [r0 + r1 + 112], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m2, [r2 + 16] pmovzxbw m3, [r2 + 24] movu m4, [r3] movu m5, [r3 + 16] movu m6, [r3 + 32] movu m7, [r3 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 
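    ; the packuswb pair saturates the 16-bit pixel+residual sums back to unsigned 8-bit samples ahead of the 32-byte row store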
packuswb m2, m3 movu [r0], m0 movu [r0 + 16], m2 pmovzxbw m0, [r2 + 32] pmovzxbw m1, [r2 + 40] pmovzxbw m2, [r2 + 48] pmovzxbw m3, [r2 + 56] movu m4, [r3 + 64] movu m5, [r3 + 80] movu m6, [r3 + 96] movu m7, [r3 + 112] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + 32], m0 movu [r0 + 48], m2 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 8] pmovzxbw m2, [r2 + r4 + 16] pmovzxbw m3, [r2 + r4 + 24] movu m4, [r3 + r5] movu m5, [r3 + r5 + 16] movu m6, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 pmovzxbw m0, [r2 + r4 + 32] pmovzxbw m1, [r2 + r4 + 40] pmovzxbw m2, [r2 + r4 + 48] pmovzxbw m3, [r2 + r4 + 56] movu m4, [r3 + r5 + 64] movu m5, [r3 + r5 + 80] movu m6, [r3 + r5 + 96] movu m7, [r3 + r5 + 112] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W64_H2 64, 16 PIXEL_ADD_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r3] movu m3, [r3 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0], m0 movu [r0 + 32], m1 movu m0, [r2 + 64] movu m1, [r2 + 96] movu m2, [r3 + 64] movu m3, [r3 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + 64], m0 movu [r0 + 96], m1 movu m0, [r2 + r4] movu m1, [r2 + r4 + 32] movu m2, [r3 + r5] movu m3, [r3 + r5 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu m0, [r2 + r4 + 64] movu m1, [r2 + r4 + 96] movu m2, [r3 + r5 + 64] movu m3, [r3 + r5 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 + 64], m0 movu [r0 + r1 + 96], m1 movu m0, [r2 + r4 * 2] movu m1, [r2 + r4 * 2 + 32] movu m2, [r3 + r5 * 2] movu m3, [r3 + r5 * 2+ 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m1 movu m0, [r2 + r4 * 2 + 64] movu m1, [r2 + r4 * 2 + 96] movu m2, [r3 + r5 * 2 + 64] movu m3, [r3 + r5 * 2 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 * 2 + 64], m0 movu [r0 + r1 * 2 + 96], m1 movu m0, [r2 + r7] movu m1, [r2 + r7 + 32] movu m2, [r3 + r8] movu m3, [r3 + r8 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r9], m0 movu [r0 + r9 + 32], m1 movu m0, [r2 + r7 + 64] movu m1, [r2 + r7 + 96] movu m2, [r3 + r8 + 64] movu m3, [r3 + r8 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r9 + 64], m0 movu [r0 + r9 + 96], m1 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else INIT_YMM avx2 cglobal pixel_add_ps_64x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/2 add r5, r5 .loop: pmovzxbw m0, [r2] ; first 16 of row 0 of src0 pmovzxbw m1, [r2 + 16] ; second 16 of row 
0 of src0 pmovzxbw m2, [r2 + 32] ; third 16 of row 0 of src0 pmovzxbw m3, [r2 + 48] ; forth 16 of row 0 of src0 movu m4, [r3] ; first 16 of row 0 of src1 movu m5, [r3 + 32] ; second 16 of row 0 of src1 movu m6, [r3 + 64] ; third 16 of row 0 of src1 movu m7, [r3 + 96] ; forth 16 of row 0 of src1 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b movu [r0], m0 ; first 32 of row 0 of dst vpermq m2, m2, 11011000b movu [r0 + 32], m2 ; second 32 of row 0 of dst pmovzxbw m0, [r2 + r4] ; first 16 of row 1 of src0 pmovzxbw m1, [r2 + r4 + 16] ; second 16 of row 1 of src0 pmovzxbw m2, [r2 + r4 + 32] ; third 16 of row 1 of src0 pmovzxbw m3, [r2 + r4 + 48] ; forth 16 of row 1 of src0 movu m4, [r3 + r5] ; first 16 of row 1 of src1 movu m5, [r3 + r5 + 32] ; second 16 of row 1 of src1 movu m6, [r3 + r5 + 64] ; third 16 of row 1 of src1 movu m7, [r3 + r5 + 96] ; forth 16 of row 1 of src1 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b movu [r0 + r1], m0 ; first 32 of row 1 of dst vpermq m2, m2, 11011000b movu [r0 + r1 + 32], m2 ; second 32 of row 1 of dst lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W64H4_avx2 16 PIXEL_ADD_PS_W64H4_avx2 32 PIXEL_ADD_PS_W64H4_avx2 48 PIXEL_ADD_PS_W64H4_avx2 64 xavs2-1.3/source/common/x86/quant8.asm000066400000000000000000000256171340660520300175710ustar00rootroot00000000000000;***************************************************************************** ;* quant8.asm: x86 quantization functions ;***************************************************************************** ;* xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Falei LUO ;* Jiaqi Zhang ;* ;* Homepage1: http://vcl.idm.pku.edu.cn/xavs2 ;* Homepage2: https://github.com/pkuvcl/xavs2 ;* Homepage3: https://gitee.com/pkuvcl/xavs2 ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at sswang @ pku.edu.cn. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text cextern pw_1 cextern pd_32767 cextern pd_n32768 ; ---------------------------------------------------------------------------- ; int quant(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); ; ---------------------------------------------------------------------------- ; ---------------------------------------------------------------------------- ; quant_sse4 INIT_XMM sse4 cglobal quant, 2,3,8 ;{ movd m4, r2mp ; m4[0] = scale movd m5, r3mp ; m5[0] = shift movd m6, r4mp ; m6[0] = add mov r2, r1 ; r2 = i_coef shr r1, 3 ; r1 = i_coef/8 pxor m7, m7 ; m7 <-- num_non_zero = 0 pshufd m4, m4, 0 ; m4[3210] = scale pshufd m6, m6, 0 ; m6[3210] = add ; .loop: ; >>>>> LOOP pmovsxwd m0, [r0 ] ; m0 = level, 4 coeff pmovsxwd m1, [r0 + 8] ; m1 = level, 4 coeff ; pabsd m2, m0 ; m2 <-- XAVS2_ABS(coef[i]) pabsd m3, m1 ; pmulld m2, m4 ; m2 <-- XAVS2_ABS(coef[i]) * scale pmulld m3, m4 ; paddd m2, m6 ; m2 <-- XAVS2_ABS(coef[i]) * scale + add paddd m3, m6 ; psrad m2, m5 ; m2 <-- (XAVS2_ABS(coef[i]) * scale + add) >> shift psrad m3, m5 ; psignd m2, m0 ; m2 <-- ((XAVS2_ABS(coef[i]) * scale + add) >> shift) * xavs2_sign2(coef[i]) psignd m3, m1 ; ; packssdw m2, m3 ; pack to 8 coeff ; mova [r0], m2 ; store add r0, 16 ; ; pxor m0, m0 ; m0 <-- 0 pcmpeqw m2, m0 ; m2[i] == 0 ? psubw m7, m2 ; m7[i] <-- count the number of 0 ; dec r1 ; jnz .loop ; <<<<< LOOP ; packuswb m7, m7 ; psadbw m7, m0 ; movifnidn rax, r2 ; eax <-- i_coef movd r1, m7 ; sub rax, r1 ; return value: num_non_zero RET ; return ;} ; ---------------------------------------------------------------------------- ; void dequant(coeff_t *coef, const int i_coef, const int scale, const int shift); ; ---------------------------------------------------------------------------- ; ---------------------------------------------------------------------------- ; dequant_sse4 INIT_XMM sse4 cglobal dequant, 2,4,7 ;{ mov r3, r3mp ; r3 <-- shift movd m4, r2mp ; m4[0] = scale movd m6, r3 ; m6[0] = shift dec r3 ; r3d <-- shift - 1 xor r2, r2 ; r2 <-- 0 shr r1, 4 ; r1 = i_coef/16 bts r2, r3 ; r2 <-- add = 1 < (shift - 1) movd m5, r2 ; m5[0] = add pshufd m4, m4, 0 ; m4[3210] = scale pshufd m5, m5, 0 ; m5[3210] = add ; .loop: ; pmovsxwd m0, [r0 ] ; load 4 coeff pmovsxwd m1, [r0 + 8] ; pmovsxwd m2, [r0 + 16] ; pmovsxwd m3, [r0 + 24] ; ; pmulld m0, m4 ; coef[i] * scale pmulld m1, m4 ; pmulld m2, m4 ; pmulld m3, m4 ; paddd m0, m5 ; coef[i] * scale + add paddd m1, m5 ; paddd m2, m5 ; paddd m3, m5 ; psrad m0, m6 ; (coef[i] * scale + add) >> shift psrad m1, m6 ; psrad m2, m6 ; psrad m3, m6 ; ; packssdw m0, m1 ; pack to 8 coeff packssdw m2, m3 ; ; mova [r0 ], m0 ; store mova [r0+16], m2 ; add r0, 32 ; r0 <-- coef + 16 dec r1 ; jnz .loop ; ; RET ; return ;} ; ---------------------------------------------------------------------------- ; int quant(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); ; ---------------------------------------------------------------------------- ; ---------------------------------------------------------------------------- ; quant_avx2 INIT_YMM avx2 cglobal quant, 2,3,8 ;{ vpbroadcastd m6, r4m ; m6[3210] = add movd xm4, r2m ; m4[0] = scale vpbroadcastd m4, xm4 ; m4[3210] = scale movd xm5, r3m ; m5[0] = shift shr r1d, 4 ; r1 = i_coef/16 pxor m7, m7 ; m7 <-- num_non_zero = 0 ; lea r5, [pw_1] .loop: pmovsxwd m0, [r0] ; m0 = level, 8 coeff pabsd m2, m0 ; m2 <-- 
XAVS2_ABS(coef[i]) pmulld m2, m4 ; m2 <-- XAVS2_ABS(coef[i]) * scale paddd m2, m2, m6 ; m2 <-- XAVS2_ABS(coef[i]) * scale + add psrad m2, xm5 ; m2 <-- (XAVS2_ABS(coef[i]) * scale + add) >> shift psignd m2, m0 ; m2 <-- ((XAVS2_ABS(coef[i]) * scale + add) >> shift) * xavs2_sign2(coef[i]) ; pmovsxwd m1, [r0 + 16] ; pabsd m3, m1 ; pmulld m3, m4 ; paddd m3, m3, m6 ; psrad m3, xm5 ; psignd m3, m1 ; ; packssdw m2, m3 ; vpermq m2, m2, q3120 ; vmovdqa [r0], m2 ; add r0, 32 ; r0 = r0 + 16 ; pminuw m2, [r5] paddw m7, m2 dec r1d ; jnz .loop ; <<<<< LOOP ; xorpd m0, m0 psadbw m7, m0 vextracti128 xm1, m7, 1 paddd xm7, xm1 movhlps xm0, xm7 paddd xm7, xm0 movd eax, xm7 RET ; return ;} ; ---------------------------------------------------------------------------- ; void dequant(coeff_t *coef, const int i_coef, const int scale, const int shift); ; ---------------------------------------------------------------------------- ; ---------------------------------------------------------------------------- %if ARCH_X86_64 ; dequant_avx2 INIT_YMM avx2 cglobal dequant, 2,4,7 ;{ movd xm4, r2m ; m4[0] = scale vpbroadcastd m4 , xm4 ; m4[3210] = scale movd xm5, r3m ; m5[0] = shift shr r1d, 4 ; r1 = i_coef/16 xor r2 , r2 ; r2 <--- 0 dec r3 ; shift -= 1 bts r2m , r3m ; r2 <-- add = 1 < (shift - 1) movd xm6 , r2m vpbroadcastd m6, xm6 ; ;m4 <--- scale ;m5 <--- shift ;m6 <--- add .loop: pmovsxwd m0, [r0 ] ; load 8 coeff pmovsxwd m1, [r0 + 16] ; pmulld m0, m4 ; coef[i] * scale pmulld m1, m4 ; paddd m0, m0, m6 ; coef[i] * scale + add paddd m1, m1, m6 ; psrad m0, xm5 ; (coef[i] * scale + add) >> shift psrad m1, xm5 ; packssdw m0, m1 ; pack to 16 coeff vpermq m0, m0, q3120 ; vmovdqa [r0], m0 ; add r0, 32 dec r1d jnz .loop RET ;} %endif xavs2-1.3/source/common/x86/quant8.h000066400000000000000000000040251340660520300172260ustar00rootroot00000000000000/***************************************************************************** * quant8.h: x86 quantization functions ***************************************************************************** * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Jiaqi Zhang * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*****************************************************************************/ #ifndef XAVS2_QUANT8_H #define XAVS2_QUANT8_H int FPFX(quant_sse4)(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); int FPFX(quant_avx2)(coeff_t *coef, const int i_coef, const int scale, const int shift, const int add); void FPFX(dequant_sse4)(coeff_t *coef, const int i_coef, const int scale, const int shift); void FPFX(dequant_avx2)(coeff_t *coef, const int i_coef, const int scale, const int shift); #endif // ifndef XAVS2_QUANT8_H xavs2-1.3/source/common/x86/sad-a.asm000066400000000000000000004663661340660520300173500ustar00rootroot00000000000000;***************************************************************************** ;* sad-a.asm: x86 sad functions ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 MSK: db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0 SECTION .text %if HIGH_BIT_DEPTH %else cextern pb_3 cextern pb_shuf8x8c cextern pw_8 cextern pd_64 ;============================================================================= ; SAD MMX ;============================================================================= %macro SAD_INC_2x16P 0 movq mm1, [r0] movq mm2, [r0+8] movq mm3, [r0+r1] movq mm4, [r0+r1+8] psadbw mm1, [r2] psadbw mm2, [r2+8] psadbw mm3, [r2+r3] psadbw mm4, [r2+r3+8] lea r0, [r0+2*r1] paddw mm1, mm2 paddw mm3, mm4 lea r2, [r2+2*r3] paddw mm0, mm1 paddw mm0, mm3 %endmacro %macro SAD_INC_2x8P 0 movq mm1, [r0] movq mm2, [r0+r1] psadbw mm1, [r2] psadbw mm2, [r2+r3] lea r0, [r0+2*r1] paddw mm0, mm1 paddw mm0, mm2 lea r2, [r2+2*r3] %endmacro %macro SAD_INC_2x4P 0 movd mm1, [r0] movd mm2, [r2] punpckldq mm1, [r0+r1] punpckldq mm2, [r2+r3] psadbw mm1, mm2 paddw mm0, mm1 lea r0, [r0+2*r1] lea r2, [r2+2*r3] %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD 2 cglobal pixel_sad_%1x%2_mmx2, 4,4 pxor mm0, mm0 %rep %2/2 SAD_INC_2x%1P %endrep movd eax, mm0 RET %endmacro SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 SAD 8, 4 SAD 16, 4 SAD 4, 16 SAD 4, 8 SAD 4, 4 ;============================================================================= ; SAD XMM ;============================================================================= %macro SAD_END_SSE2 0 movhlps m1, m0 paddw m0, m1 movd eax, m0 RET %endmacro %macro PROCESS_SAD_12x4 0 movu m1, [r2] movu m2, [r0] pand m1, m4 pand m2, m4 psadbw m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r0] pand m1, m4 pand m2, m4 psadbw m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r0] pand m1, m4 pand m2, m4 psadbw m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r0] pand m1, m4 pand m2, m4 psadbw m1, m2 paddd m0, m1 %endmacro %macro PROCESS_SAD_16x4 0 movu m1, [r2] movu m2, [r2 + r3] psadbw m1, [r0] psadbw m2, [r0 + r1] paddd m1, m2 paddd m0, m1 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r2] movu m2, [r2 + r3] psadbw m1, [r0] psadbw m2, [r0 + r1] paddd m1, m2 paddd m0, m1 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] %endmacro %macro PROCESS_SAD_24x4 0 movu m1, [r2] movq m2, [r2 + 16] lea r2, [r2 + r3] movu m3, [r2] movq m4, [r2 + 16] %ifdef USING_OLD_SAD_24x32 ; --- fixed by sfdong psadbw m1, [r0] ; !!!r0 align 16-byte? psadbw m3, [r0 + r1] ; !!!r0 align 16-byte? %else movu m5, [r0] movu m6, [r0 + r1] psadbw m1, m5 psadbw m3, m6 %endif paddd m0, m1 paddd m0, m3 movq m1, [r0 + 16] lea r0, [r0 + r1] movq m3, [r0 + 16] punpcklqdq m2, m4 punpcklqdq m1, m3 psadbw m2, m1 paddd m0, m2 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movq m2, [r2 + 16] lea r2, [r2 + r3] movu m3, [r2] movq m4, [r2 + 16] %ifdef USING_OLD_SAD_24x32 ; --- fixed by sfdong psadbw m1, [r0] ; !!!r0 align 16-byte? psadbw m3, [r0 + r1] ; !!!r0 align 16-byte? 
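    ; note: with USING_OLD_SAD_24x32 the psadbw memory operands above require r0 to be 16-byte aligned; the default path below loads the rows with movu first, so an unaligned r0 is handled safely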
%else movu m5, [r0] movu m6, [r0 + r1] psadbw m1, m5 psadbw m3, m6 %endif paddd m0, m1 paddd m0, m3 movq m1, [r0 + 16] lea r0, [r0 + r1] movq m3, [r0 + 16] punpcklqdq m2, m4 punpcklqdq m1, m3 psadbw m2, m1 paddd m0, m2 %endmacro %macro PROCESS_SAD_32x4 0 movu m1, [r2] movu m2, [r2 + 16] psadbw m1, [r0] psadbw m2, [r0 + 16] paddd m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] psadbw m1, [r0] psadbw m2, [r0 + 16] paddd m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] psadbw m1, [r0] psadbw m2, [r0 + 16] paddd m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] psadbw m1, [r0] psadbw m2, [r0 + 16] paddd m1, m2 paddd m0, m1 lea r2, [r2 + r3] lea r0, [r0 + r1] %endmacro %macro PROCESS_SAD_48x4 0 movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] paddd m1, m2 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] paddd m1, m2 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] paddd m1, m2 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] paddd m1, m2 paddd m0, m1 paddd m0, m3 %endmacro %macro PROCESS_SAD_8x4 0 movq m1, [r2] movq m2, [r2 + r3] lea r2, [r2 + 2 * r3] movq m3, [r0] movq m4, [r0 + r1] lea r0, [r0 + 2 * r1] punpcklqdq m1, m2 punpcklqdq m3, m4 psadbw m1, m3 paddd m0, m1 movq m1, [r2] movq m2, [r2 + r3] lea r2, [r2 + 2 * r3] movq m3, [r0] movq m4, [r0 + r1] lea r0, [r0 + 2 * r1] punpcklqdq m1, m2 punpcklqdq m3, m4 psadbw m1, m3 paddd m0, m1 %endmacro %macro PROCESS_SAD_64x4 0 movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] movu m4, [r2 + 48] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] psadbw m4, [r0 + 48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] movu m4, [r2 + 48] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] psadbw m4, [r0 + 48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] movu m4, [r2 + 48] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] psadbw m4, [r0 + 48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] movu m1, [r2] movu m2, [r2 + 16] movu m3, [r2 + 32] movu m4, [r2 + 48] psadbw m1, [r0] psadbw m2, [r0 + 16] psadbw m3, [r0 + 32] psadbw m4, [r0 + 48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + r3] lea r0, [r0 + r1] %endmacro %macro SAD_W16 0 ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x16, 4,4,8 movu m0, [r2] movu m1, [r2+r3] lea r2, [r2+2*r3] movu m2, [r2] movu m3, [r2+r3] lea r2, [r2+2*r3] psadbw m0, [r0] psadbw m1, [r0+r1] lea r0, [r0+2*r1] movu m4, [r2] paddw m0, m1 psadbw m2, [r0] psadbw m3, [r0+r1] lea r0, [r0+2*r1] movu m5, [r2+r3] lea r2, [r2+2*r3] paddw m2, m3 movu m6, [r2] movu m7, [r2+r3] lea r2, [r2+2*r3] paddw m0, m2 psadbw m4, [r0] psadbw m5, [r0+r1] lea 
r0, [r0+2*r1] movu m1, [r2] paddw m4, m5 psadbw m6, [r0] psadbw m7, [r0+r1] lea r0, [r0+2*r1] movu m2, [r2+r3] lea r2, [r2+2*r3] paddw m6, m7 movu m3, [r2] paddw m0, m4 movu m4, [r2+r3] lea r2, [r2+2*r3] paddw m0, m6 psadbw m1, [r0] psadbw m2, [r0+r1] lea r0, [r0+2*r1] movu m5, [r2] paddw m1, m2 psadbw m3, [r0] psadbw m4, [r0+r1] lea r0, [r0+2*r1] movu m6, [r2+r3] lea r2, [r2+2*r3] paddw m3, m4 movu m7, [r2] paddw m0, m1 movu m1, [r2+r3] paddw m0, m3 psadbw m5, [r0] psadbw m6, [r0+r1] lea r0, [r0+2*r1] paddw m5, m6 psadbw m7, [r0] psadbw m1, [r0+r1] paddw m7, m1 paddw m0, m5 paddw m0, m7 SAD_END_SSE2 ;----------------------------------------------------------------------------- ; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x8, 4,4 movu m0, [r2] movu m2, [r2+r3] lea r2, [r2+2*r3] movu m3, [r2] movu m4, [r2+r3] psadbw m0, [r0] psadbw m2, [r0+r1] lea r0, [r0+2*r1] psadbw m3, [r0] psadbw m4, [r0+r1] lea r0, [r0+2*r1] lea r2, [r2+2*r3] paddw m0, m2 paddw m3, m4 paddw m0, m3 movu m1, [r2] movu m2, [r2+r3] lea r2, [r2+2*r3] movu m3, [r2] movu m4, [r2+r3] psadbw m1, [r0] psadbw m2, [r0+r1] lea r0, [r0+2*r1] psadbw m3, [r0] psadbw m4, [r0+r1] lea r0, [r0+2*r1] lea r2, [r2+2*r3] paddw m1, m2 paddw m3, m4 paddw m0, m1 paddw m0, m3 SAD_END_SSE2 ;----------------------------------------------------------------------------- ; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x12, 4,4,3 pxor m0, m0 PROCESS_SAD_16x4 PROCESS_SAD_16x4 PROCESS_SAD_16x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x32, 4,5,3 pxor m0, m0 mov r4d, 4 .loop: PROCESS_SAD_16x4 PROCESS_SAD_16x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x64, 4,5,3 pxor m0, m0 mov r4d, 8 .loop: PROCESS_SAD_16x4 PROCESS_SAD_16x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_16x4, 4,4,3 movu m0, [r2] movu m1, [r2 + r3] psadbw m0, [r0] psadbw m1, [r0 + r1] paddd m0, m1 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r2] movu m2, [r2 + r3] psadbw m1, [r0] psadbw m2, [r0 + r1] paddd m1, m2 paddd m0, m1 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_32x8, 4,4,3 pxor m0, m0 PROCESS_SAD_32x4 PROCESS_SAD_32x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t ) 
;----------------------------------------------------------------------------- cglobal pixel_sad_32x24, 4,5,3 pxor m0, m0 mov r4d, 3 .loop: PROCESS_SAD_32x4 PROCESS_SAD_32x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_32x32, 4,5,3 pxor m0, m0 mov r4d, 4 .loop: PROCESS_SAD_32x4 PROCESS_SAD_32x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_32x16, 4,4,3 pxor m0, m0 PROCESS_SAD_32x4 PROCESS_SAD_32x4 PROCESS_SAD_32x4 PROCESS_SAD_32x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_32x64, 4,5,3 pxor m0, m0 mov r4d, 8 .loop: PROCESS_SAD_32x4 PROCESS_SAD_32x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_8x32, 4,5,3 pxor m0, m0 mov r4d, 4 .loop: PROCESS_SAD_8x4 PROCESS_SAD_8x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_64x16, 4,4,5 pxor m0, m0 PROCESS_SAD_64x4 PROCESS_SAD_64x4 PROCESS_SAD_64x4 PROCESS_SAD_64x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_64x32, 4,5,5 pxor m0, m0 mov r4, 4 .loop: PROCESS_SAD_64x4 PROCESS_SAD_64x4 dec r4 jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_64x48, 4,5,5 pxor m0, m0 mov r4, 6 .loop: PROCESS_SAD_64x4 PROCESS_SAD_64x4 dec r4d jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_64x64, 4,5,5 pxor m0, m0 mov r4, 8 .loop: PROCESS_SAD_64x4 PROCESS_SAD_64x4 dec r4 jnz .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_48x64, 4,5,5 pxor m0, m0 mov r4, 
64 .loop: PROCESS_SAD_48x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_48x4 lea r2, [r2 + r3] lea r0, [r0 + r1] sub r4, 8 cmp r4, 8 jnz .loop PROCESS_SAD_48x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_48x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_24x32, 4,5,7 pxor m0, m0 mov r4, 32 .loop: PROCESS_SAD_24x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_24x4 lea r2, [r2 + r3] lea r0, [r0 + r1] sub r4, 8 cmp r4, 8 jnz .loop PROCESS_SAD_24x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_24x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sad_12x16, 4,4,4 mova m4, [MSK] pxor m0, m0 PROCESS_SAD_12x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_12x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_12x4 lea r2, [r2 + r3] lea r0, [r0 + r1] PROCESS_SAD_12x4 movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD_W16 INIT_XMM sse3 SAD_W16 INIT_XMM sse2, aligned SAD_W16 %macro SAD_INC_4x8P_SSE 1 movq m1, [r0] movq m2, [r0+r1] lea r0, [r0+2*r1] movq m3, [r2] movq m4, [r2+r3] lea r2, [r2+2*r3] movhps m1, [r0] movhps m2, [r0+r1] movhps m3, [r2] movhps m4, [r2+r3] lea r0, [r0+2*r1] psadbw m1, m3 psadbw m2, m4 lea r2, [r2+2*r3] ACCUM paddw, 0, 1, %1 paddw m0, m2 %endmacro INIT_XMM ;Even on Nehalem, no sizes other than 8x16 benefit from this method. cglobal pixel_sad_8x16_sse2, 4,4 SAD_INC_4x8P_SSE 0 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_END_SSE2 RET ;============================================================================= ; SAD x3/x4 MMX ;============================================================================= %macro SAD_X3_START_1x8P 0 movq mm3, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] psadbw mm0, mm3 psadbw mm1, mm3 psadbw mm2, mm3 %endmacro %macro SAD_X3_1x8P 2 movq mm3, [r0+%1] movq mm4, [r1+%2] movq mm5, [r2+%2] movq mm6, [r3+%2] psadbw mm4, mm3 psadbw mm5, mm3 psadbw mm6, mm3 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 %endmacro %macro SAD_X3_START_2x4P 3 movd mm3, [r0] movd %1, [r1] movd %2, [r2] movd %3, [r3] punpckldq mm3, [r0+FENC_STRIDE] punpckldq %1, [r1+r4] punpckldq %2, [r2+r4] punpckldq %3, [r3+r4] psadbw %1, mm3 psadbw %2, mm3 psadbw %3, mm3 %endmacro %macro SAD_X3_2x16P 1 %if %1 SAD_X3_START_1x8P %else SAD_X3_1x8P 0, 0 %endif SAD_X3_1x8P 8, 8 SAD_X3_1x8P FENC_STRIDE, r4 SAD_X3_1x8P FENC_STRIDE+8, r4+8 add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X3_2x8P 1 %if %1 SAD_X3_START_1x8P %else SAD_X3_1x8P 0, 0 %endif SAD_X3_1x8P FENC_STRIDE, r4 add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X3_2x4P 1 %if %1 SAD_X3_START_2x4P mm0, mm1, mm2 %else SAD_X3_START_2x4P mm4, mm5, mm6 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 %endif add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X4_START_1x8P 0 movq mm7, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] movq mm3, [r4] psadbw mm0, mm7 psadbw mm1, mm7 psadbw mm2, mm7 psadbw mm3, mm7 %endmacro %macro SAD_X4_1x8P 2 movq mm7, [r0+%1] movq mm4, [r1+%2] movq mm5, 
[r2+%2] movq mm6, [r3+%2] psadbw mm4, mm7 psadbw mm5, mm7 psadbw mm6, mm7 psadbw mm7, [r4+%2] paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 paddw mm3, mm7 %endmacro %macro SAD_X4_START_2x4P 0 movd mm7, [r0] movd mm0, [r1] movd mm1, [r2] movd mm2, [r3] movd mm3, [r4] punpckldq mm7, [r0+FENC_STRIDE] punpckldq mm0, [r1+r5] punpckldq mm1, [r2+r5] punpckldq mm2, [r3+r5] punpckldq mm3, [r4+r5] psadbw mm0, mm7 psadbw mm1, mm7 psadbw mm2, mm7 psadbw mm3, mm7 %endmacro %macro SAD_X4_INC_2x4P 0 movd mm7, [r0] movd mm4, [r1] movd mm5, [r2] punpckldq mm7, [r0+FENC_STRIDE] punpckldq mm4, [r1+r5] punpckldq mm5, [r2+r5] psadbw mm4, mm7 psadbw mm5, mm7 paddw mm0, mm4 paddw mm1, mm5 movd mm4, [r3] movd mm5, [r4] punpckldq mm4, [r3+r5] punpckldq mm5, [r4+r5] psadbw mm4, mm7 psadbw mm5, mm7 paddw mm2, mm4 paddw mm3, mm5 %endmacro %macro SAD_X4_2x16P 1 %if %1 SAD_X4_START_1x8P %else SAD_X4_1x8P 0, 0 %endif SAD_X4_1x8P 8, 8 SAD_X4_1x8P FENC_STRIDE, r5 SAD_X4_1x8P FENC_STRIDE+8, r5+8 add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X4_2x8P 1 %if %1 SAD_X4_START_1x8P %else SAD_X4_1x8P 0, 0 %endif SAD_X4_1x8P FENC_STRIDE, r5 add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X4_2x4P 1 %if %1 SAD_X4_START_2x4P %else SAD_X4_INC_2x4P %endif add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X3_END 0 %if UNIX64 movd [r5+0], mm0 movd [r5+4], mm1 movd [r5+8], mm2 %else mov r0, r5mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 %endif RET %endmacro %macro SAD_X4_END 0 mov r0, r6mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 movd [r0+12], mm3 RET %endmacro %macro SAD_X3_12x4 0 movu m3, [r0] ; mova - fixed by sfdong movu m5, [r1] pand m3, m4 pand m5, m4 psadbw m5, m3 paddd m0, m5 movu m5, [r2] pand m5, m4 psadbw m5, m3 paddd m1, m5 movu m5, [r3] pand m5, m4 psadbw m5, m3 paddd m2, m5 movu m3, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r1 + r4] pand m3, m4 pand m5, m4 psadbw m5, m3 paddd m0, m5 movu m5, [r2 + r4] pand m5, m4 psadbw m5, m3 paddd m1, m5 movu m5, [r3 + r4] pand m5, m4 psadbw m5, m3 paddd m2, m5 movu m3, [r0 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m5, [r1 + r4 * 2] pand m3, m4 pand m5, m4 psadbw m5, m3 paddd m0, m5 movu m5, [r2 + r4 * 2] pand m5, m4 psadbw m5, m3 paddd m1, m5 movu m5, [r3 + r4 * 2] pand m5, m4 psadbw m5, m3 paddd m2, m5 lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] movu m3, [r0 + FENC_STRIDE * 3] ; mova - fixed by sfdong movu m5, [r1 + r4] pand m3, m4 pand m5, m4 psadbw m5, m3 paddd m0, m5 movu m5, [r2 + r4] pand m5, m4 psadbw m5, m3 paddd m1, m5 movu m5, [r3 + r4] pand m5, m4 psadbw m5, m3 paddd m2, m5 lea r0, [r0 + FENC_STRIDE * 4] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] %endmacro %macro SAD_X4_12x4 0 movu m4, [r0] ; mova - fixed by sfdong movu m5, [r1] pand m4, m6 pand m5, m6 psadbw m5, m4 paddd m0, m5 movu m5, [r2] pand m5, m6 psadbw m5, m4 paddd m1, m5 movu m5, [r3] pand m5, m6 psadbw m5, m4 paddd m2, m5 movu m5, [r4] pand m5, m6 psadbw m5, m4 paddd m3, m5 movu m4, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r1 + r5] pand m4, m6 pand m5, m6 psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r5] pand m5, m6 psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r5] pand m5, m6 psadbw m5, m4 paddd m2, m5 movu m5, [r4 + r5] pand m5, m6 psadbw m5, m4 paddd m3, m5 movu m4, [r0 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m5, [r1 + r5 * 
2] pand m4, m6 pand m5, m6 psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r5 * 2] pand m5, m6 psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r5 * 2] pand m5, m6 psadbw m5, m4 paddd m2, m5 movu m5, [r4 + r5 * 2] pand m5, m6 psadbw m5, m4 paddd m3, m5 lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] movu m4, [r0 + FENC_STRIDE * 3] ; mova - fixed by sfdong movu m5, [r1 + r5] pand m4, m6 pand m5, m6 psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r5] pand m5, m6 psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r5] pand m5, m6 psadbw m5, m4 paddd m2, m5 movu m5, [r4 + r5] pand m5, m6 psadbw m5, m4 paddd m3, m5 lea r0, [r0 + FENC_STRIDE * 4] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] %endmacro %macro SAD_X3_24x4 0 movu m3, [r0 ] ; mova - fixed by sfdong movu m4, [r0 + 16] ; mova - fixed by sfdong movu m5, [r1] movu m6, [r1 + 16] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m0, m5 movu m5, [r2] movu m6, [r2 + 16] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m1, m5 movu m5, [r3] movu m6, [r3 + 16] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m2, m5 movu m3, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m4, [r0 + 16 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r1 + r4] movu m6, [r1 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m0, m5 movu m5, [r2 + r4] movu m6, [r2 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m1, m5 movu m5, [r3 + r4] movu m6, [r3 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m2, m5 movu m3, [r0 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m4, [r0 + 16 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m5, [r1 + r4 * 2] movu m6, [r1 + 16 + r4 * 2] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m0, m5 movu m5, [r2 + r4 * 2] movu m6, [r2 + 16 + r4 * 2] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m1, m5 movu m5, [r3 + r4 * 2] movu m6, [r3 + 16 + r4 * 2] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] movu m3, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m4, [r0 + 16 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r1 + r4] movu m6, [r1 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m0, m5 movu m5, [r2 + r4] movu m6, [r2 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m1, m5 movu m5, [r3 + r4] movu m6, [r3 + 16 + r4] psadbw m5, m3 psadbw m6, m4 pshufd m6, m6, 84 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] %endmacro %macro SAD_X4_24x4 0 movu m4, [r0 ] ; mova - fixed by sfdong movu m5, [r0 + 16] ; mova - fixed by sfdong movu m6, [r1] movu m7, [r1 + 16] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m0, m6 movu m6, [r2] movu m7, [r2 + 16] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m1, m6 movu m6, [r3] movu m7, [r3 + 16] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m2, m6 movu m6, [r4] movu m7, [r4 + 16] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m3, m6 movu m4, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r0 + 16 + FENC_STRIDE] ; mova - fixed by sfdong movu m6, [r1 + r5] movu m7, [r1 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m0, m6 movu m6, [r2 + r5] 
movu m7, [r2 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m1, m6 movu m6, [r3 + r5] movu m7, [r3 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m2, m6 movu m6, [r4 + r5] movu m7, [r4 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m3, m6 movu m4, [r0 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m5, [r0 + 16 + FENC_STRIDE * 2] ; mova - fixed by sfdong movu m6, [r1 + r5 * 2] movu m7, [r1 + 16 + r5 * 2] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m0, m6 movu m6, [r2 + r5 * 2] movu m7, [r2 + 16 + r5 * 2] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m1, m6 movu m6, [r3 + r5 * 2] movu m7, [r3 + 16 + r5 * 2] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m2, m6 movu m6, [r4 + r5 * 2] movu m7, [r4 + 16 + r5 * 2] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] movu m4, [r0 + FENC_STRIDE] ; mova - fixed by sfdong movu m5, [r0 + 16 + FENC_STRIDE] ; mova - fixed by sfdong movu m6, [r1 + r5] movu m7, [r1 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m0, m6 movu m6, [r2 + r5] movu m7, [r2 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m1, m6 movu m6, [r3 + r5] movu m7, [r3 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m2, m6 movu m6, [r4 + r5] movu m7, [r4 + 16 + r5] psadbw m6, m4 psadbw m7, m5 pshufd m7, m7, 84 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] %endmacro %macro SAD_X3_32x4 0 mova m3, [r0] mova m4, [r0 + 16] movu m5, [r1] movu m6, [r1 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m0, m5 movu m5, [r2] movu m6, [r2 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m1, m5 movu m5, [r3] movu m6, [r3 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r4] lea r2, [r2 + r4] lea r3, [r3 + r4] mova m3, [r0] mova m4, [r0 + 16] movu m5, [r1] movu m6, [r1 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m0, m5 movu m5, [r2] movu m6, [r2 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m1, m5 movu m5, [r3] movu m6, [r3 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r4] lea r2, [r2 + r4] lea r3, [r3 + r4] mova m3, [r0] mova m4, [r0 + 16] movu m5, [r1] movu m6, [r1 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m0, m5 movu m5, [r2] movu m6, [r2 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m1, m5 movu m5, [r3] movu m6, [r3 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r4] lea r2, [r2 + r4] lea r3, [r3 + r4] mova m3, [r0] mova m4, [r0 + 16] movu m5, [r1] movu m6, [r1 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m0, m5 movu m5, [r2] movu m6, [r2 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m1, m5 movu m5, [r3] movu m6, [r3 + 16] psadbw m5, m3 psadbw m6, m4 paddd m5, m6 paddd m2, m5 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r4] lea r2, [r2 + r4] lea r3, [r3 + r4] %endmacro %macro SAD_X4_32x4 0 mova m4, [r0] mova m5, [r0 + 16] movu m6, [r1] movu m7, [r1 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m0, m6 movu m6, [r2] movu m7, [r2 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m1, m6 movu m6, [r3] movu m7, [r3 + 16] psadbw 
m6, m4 psadbw m7, m5 paddd m6, m7 paddd m2, m6 movu m6, [r4] movu m7, [r4 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r5] lea r2, [r2 + r5] lea r3, [r3 + r5] lea r4, [r4 + r5] mova m4, [r0] mova m5, [r0 + 16] movu m6, [r1] movu m7, [r1 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m0, m6 movu m6, [r2] movu m7, [r2 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m1, m6 movu m6, [r3] movu m7, [r3 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m2, m6 movu m6, [r4] movu m7, [r4 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r5] lea r2, [r2 + r5] lea r3, [r3 + r5] lea r4, [r4 + r5] mova m4, [r0] mova m5, [r0 + 16] movu m6, [r1] movu m7, [r1 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m0, m6 movu m6, [r2] movu m7, [r2 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m1, m6 movu m6, [r3] movu m7, [r3 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m2, m6 movu m6, [r4] movu m7, [r4 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r5] lea r2, [r2 + r5] lea r3, [r3 + r5] lea r4, [r4 + r5] mova m4, [r0] mova m5, [r0 + 16] movu m6, [r1] movu m7, [r1 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m0, m6 movu m6, [r2] movu m7, [r2 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m1, m6 movu m6, [r3] movu m7, [r3 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m2, m6 movu m6, [r4] movu m7, [r4 + 16] psadbw m6, m4 psadbw m7, m5 paddd m6, m7 paddd m3, m6 lea r0, [r0 + FENC_STRIDE] lea r1, [r1 + r5] lea r2, [r2 + r5] lea r3, [r3 + r5] lea r4, [r4 + r5] %endmacro %macro SAD_X3_48x4 0 mova m3, [r0] mova m4, [r0 + 16] mova m5, [r0 + 32] movu m6, [r1] psadbw m6, m3 paddd m0, m6 movu m6, [r1 + 16] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 32] psadbw m6, m5 paddd m0, m6 movu m6, [r2] psadbw m6, m3 paddd m1, m6 movu m6, [r2 + 16] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 32] psadbw m6, m5 paddd m1, m6 movu m6, [r3] psadbw m6, m3 paddd m2, m6 movu m6, [r3 + 16] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 32] psadbw m6, m5 paddd m2, m6 mova m3, [r0 + FENC_STRIDE] mova m4, [r0 + 16 + FENC_STRIDE] mova m5, [r0 + 32 + FENC_STRIDE] movu m6, [r1 + r4] psadbw m6, m3 paddd m0, m6 movu m6, [r1 + 16 + r4] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 32 + r4] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r4] psadbw m6, m3 paddd m1, m6 movu m6, [r2 + 16 + r4] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 32 + r4] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r4] psadbw m6, m3 paddd m2, m6 movu m6, [r3 + 16 + r4] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 32 + r4] psadbw m6, m5 paddd m2, m6 mova m3, [r0 + FENC_STRIDE * 2] mova m4, [r0 + 16 + FENC_STRIDE * 2] mova m5, [r0 + 32 + FENC_STRIDE * 2] movu m6, [r1 + r4 * 2] psadbw m6, m3 paddd m0, m6 movu m6, [r1 + 16 + r4 * 2] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 32 + r4 * 2] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r4 * 2] psadbw m6, m3 paddd m1, m6 movu m6, [r2 + 16 + r4 * 2] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 32 + r4 * 2] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r4 * 2] psadbw m6, m3 paddd m2, m6 movu m6, [r3 + 16 + r4 * 2] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 32 + r4 * 2] psadbw m6, m5 paddd m2, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] mova m3, [r0 + FENC_STRIDE] mova m4, [r0 + 16 + FENC_STRIDE] mova m5, [r0 + 32 + FENC_STRIDE] movu m6, [r1 + r4] psadbw m6, m3 paddd m0, m6 movu m6, [r1 + 16 + r4] psadbw m6, m4 
paddd m0, m6 movu m6, [r1 + 32 + r4] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r4] psadbw m6, m3 paddd m1, m6 movu m6, [r2 + 16 + r4] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 32 + r4] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r4] psadbw m6, m3 paddd m2, m6 movu m6, [r3 + 16 + r4] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 32 + r4] psadbw m6, m5 paddd m2, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] %endmacro %macro SAD_X4_48x4 0 mova m4, [r0] mova m5, [r0 + 16] mova m6, [r0 + 32] movu m7, [r1] psadbw m7, m4 paddd m0, m7 movu m7, [r1 + 16] psadbw m7, m5 paddd m0, m7 movu m7, [r1 + 32] psadbw m7, m6 paddd m0, m7 movu m7, [r2] psadbw m7, m4 paddd m1, m7 movu m7, [r2 + 16] psadbw m7, m5 paddd m1, m7 movu m7, [r2 + 32] psadbw m7, m6 paddd m1, m7 movu m7, [r3] psadbw m7, m4 paddd m2, m7 movu m7, [r3 + 16] psadbw m7, m5 paddd m2, m7 movu m7, [r3 + 32] psadbw m7, m6 paddd m2, m7 movu m7, [r4] psadbw m7, m4 paddd m3, m7 movu m7, [r4 + 16] psadbw m7, m5 paddd m3, m7 movu m7, [r4 + 32] psadbw m7, m6 paddd m3, m7 mova m4, [r0 + FENC_STRIDE] mova m5, [r0 + 16 + FENC_STRIDE] mova m6, [r0 + 32 + FENC_STRIDE] movu m7, [r1 + r5] psadbw m7, m4 paddd m0, m7 movu m7, [r1 + 16 + r5] psadbw m7, m5 paddd m0, m7 movu m7, [r1 + 32 + r5] psadbw m7, m6 paddd m0, m7 movu m7, [r2 + r5] psadbw m7, m4 paddd m1, m7 movu m7, [r2 + 16 + r5] psadbw m7, m5 paddd m1, m7 movu m7, [r2 + 32 + r5] psadbw m7, m6 paddd m1, m7 movu m7, [r3 + r5] psadbw m7, m4 paddd m2, m7 movu m7, [r3 + 16 + r5] psadbw m7, m5 paddd m2, m7 movu m7, [r3 + 32 + r5] psadbw m7, m6 paddd m2, m7 movu m7, [r4 + r5] psadbw m7, m4 paddd m3, m7 movu m7, [r4 + 16 + r5] psadbw m7, m5 paddd m3, m7 movu m7, [r4 + 32 + r5] psadbw m7, m6 paddd m3, m7 mova m4, [r0 + FENC_STRIDE * 2] mova m5, [r0 + 16 + FENC_STRIDE * 2] mova m6, [r0 + 32 + FENC_STRIDE * 2] movu m7, [r1 + r5 * 2] psadbw m7, m4 paddd m0, m7 movu m7, [r1 + 16 + r5 * 2] psadbw m7, m5 paddd m0, m7 movu m7, [r1 + 32 + r5 * 2] psadbw m7, m6 paddd m0, m7 movu m7, [r2 + r5 * 2] psadbw m7, m4 paddd m1, m7 movu m7, [r2 + 16 + r5 * 2] psadbw m7, m5 paddd m1, m7 movu m7, [r2 + 32 + r5 * 2] psadbw m7, m6 paddd m1, m7 movu m7, [r3 + r5 * 2] psadbw m7, m4 paddd m2, m7 movu m7, [r3 + 16 + r5 * 2] psadbw m7, m5 paddd m2, m7 movu m7, [r3 + 32 + r5 * 2] psadbw m7, m6 paddd m2, m7 movu m7, [r4 + r5 * 2] psadbw m7, m4 paddd m3, m7 movu m7, [r4 + 16 + r5 * 2] psadbw m7, m5 paddd m3, m7 movu m7, [r4 + 32 + r5 * 2] psadbw m7, m6 paddd m3, m7 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] mova m4, [r0 + FENC_STRIDE] mova m5, [r0 + 16 + FENC_STRIDE] mova m6, [r0 + 32 + FENC_STRIDE] movu m7, [r1 + r5] psadbw m7, m4 paddd m0, m7 movu m7, [r1 + 16 + r5] psadbw m7, m5 paddd m0, m7 movu m7, [r1 + 32 + r5] psadbw m7, m6 paddd m0, m7 movu m7, [r2 + r5] psadbw m7, m4 paddd m1, m7 movu m7, [r2 + 16 + r5] psadbw m7, m5 paddd m1, m7 movu m7, [r2 + 32 + r5] psadbw m7, m6 paddd m1, m7 movu m7, [r3 + r5] psadbw m7, m4 paddd m2, m7 movu m7, [r3 + 16 + r5] psadbw m7, m5 paddd m2, m7 movu m7, [r3 + 32 + r5] psadbw m7, m6 paddd m2, m7 movu m7, [r4 + r5] psadbw m7, m4 paddd m3, m7 movu m7, [r4 + 16 + r5] psadbw m7, m5 paddd m3, m7 movu m7, [r4 + 32 + r5] psadbw m7, m6 paddd m3, m7 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] %endmacro %macro SAD_X3_64x4 0 mova m3, [r0] mova m4, [r0 + 16] movu m5, [r1] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 16] 
psadbw m5, m4 paddd m0, m5 movu m5, [r2] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 16] psadbw m5, m4 paddd m1, m5 movu m5, [r3] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 16] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + 32] mova m4, [r0 + 48] movu m5, [r1 + 32] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 48] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + 32] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 48] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + 32] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 48] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + FENC_STRIDE] mova m4, [r0 + 16 + FENC_STRIDE] movu m5, [r1 + r4] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 16 + r4] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r4] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 16 + r4] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r4] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 16 + r4] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + 32 + FENC_STRIDE] mova m4, [r0 + 48 + FENC_STRIDE] movu m5, [r1 + 32 + r4] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 48 + r4] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + 32 + r4] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 48 + r4] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + 32 + r4] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 48 + r4] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + FENC_STRIDE * 2] mova m4, [r0 + 16 + FENC_STRIDE * 2] movu m5, [r1 + r4 * 2] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 16 + r4 * 2] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r4 * 2] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 16 + r4 * 2] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r4 * 2] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 16 + r4 * 2] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + 32 + FENC_STRIDE * 2] mova m4, [r0 + 48 + FENC_STRIDE * 2] movu m5, [r1 + 32 + r4 * 2] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 48 + r4 * 2] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + 32 + r4 * 2] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 48 + r4 * 2] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + 32 + r4 * 2] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 48 + r4 * 2] psadbw m5, m4 paddd m2, m5 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] mova m3, [r0 + FENC_STRIDE] mova m4, [r0 + 16 + FENC_STRIDE] movu m5, [r1 + r4] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 16 + r4] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + r4] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 16 + r4] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + r4] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 16 + r4] psadbw m5, m4 paddd m2, m5 mova m3, [r0 + 32 + FENC_STRIDE] mova m4, [r0 + 48 + FENC_STRIDE] movu m5, [r1 + 32 + r4] psadbw m5, m3 paddd m0, m5 movu m5, [r1 + 48 + r4] psadbw m5, m4 paddd m0, m5 movu m5, [r2 + 32 + r4] psadbw m5, m3 paddd m1, m5 movu m5, [r2 + 48 + r4] psadbw m5, m4 paddd m1, m5 movu m5, [r3 + 32 + r4] psadbw m5, m3 paddd m2, m5 movu m5, [r3 + 48 + r4] psadbw m5, m4 paddd m2, m5 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r4 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r4 * 2] %endmacro %macro SAD_X4_64x4 0 mova m4, [r0] mova m5, [r0 + 16] movu m6, [r1] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 16] psadbw m6, m5 paddd m0, m6 movu m6, [r2] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 16] psadbw m6, m5 paddd m1, m6 movu m6, [r3] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 16] psadbw m6, m5 paddd m2, m6 movu m6, [r4] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 16] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + 32] mova m5, [r0 + 48] movu m6, [r1 + 32] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 48] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + 32] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 48] 
psadbw m6, m5 paddd m1, m6 movu m6, [r3 + 32] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 48] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + 32] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 48] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + FENC_STRIDE] mova m5, [r0 + 16 + FENC_STRIDE] movu m6, [r1 + r5] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 16 + r5] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r5] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 16 + r5] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r5] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 16 + r5] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + r5] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 16 + r5] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + 32 + FENC_STRIDE] mova m5, [r0 + 48 + FENC_STRIDE] movu m6, [r1 + 32 + r5] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 48 + r5] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + 32 + r5] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 48 + r5] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + 32 + r5] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 48 + r5] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + 32 + r5] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 48 + r5] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + FENC_STRIDE * 2] mova m5, [r0 + 16 + FENC_STRIDE * 2] movu m6, [r1 + r5 * 2] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 16 + r5 * 2] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r5 * 2] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 16 + r5 * 2] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r5 * 2] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 16 + r5 * 2] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + r5 * 2] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 16 + r5 * 2] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + 32 + FENC_STRIDE * 2] mova m5, [r0 + 48 + FENC_STRIDE * 2] movu m6, [r1 + 32 + r5 * 2] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 48 + r5 * 2] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + 32 + r5 * 2] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 48 + r5 * 2] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + 32 + r5 * 2] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 48 + r5 * 2] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + 32 + r5 * 2] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 48 + r5 * 2] psadbw m6, m5 paddd m3, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] mova m4, [r0 + FENC_STRIDE] mova m5, [r0 + 16 + FENC_STRIDE] movu m6, [r1 + r5] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 16 + r5] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + r5] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 16 + r5] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + r5] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 16 + r5] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + r5] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 16 + r5] psadbw m6, m5 paddd m3, m6 mova m4, [r0 + 32 + FENC_STRIDE] mova m5, [r0 + 48 + FENC_STRIDE] movu m6, [r1 + 32 + r5] psadbw m6, m4 paddd m0, m6 movu m6, [r1 + 48 + r5] psadbw m6, m5 paddd m0, m6 movu m6, [r2 + 32 + r5] psadbw m6, m4 paddd m1, m6 movu m6, [r2 + 48 + r5] psadbw m6, m5 paddd m1, m6 movu m6, [r3 + 32 + r5] psadbw m6, m4 paddd m2, m6 movu m6, [r3 + 48 + r5] psadbw m6, m5 paddd m2, m6 movu m6, [r4 + 32 + r5] psadbw m6, m4 paddd m3, m6 movu m6, [r4 + 48 + r5] psadbw m6, m5 paddd m3, m6 lea r0, [r0 + FENC_STRIDE * 2] lea r1, [r1 + r5 * 2] lea r2, [r2 + r5 * 2] lea r3, [r3 + r5 * 2] lea r4, [r4 + r5 * 2] %endmacro ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) 
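; (one pass computes the SAD of the same fenc block against three candidate
;  reference blocks that share the stride i_stride; the three sums are written
;  to scores[0..2])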
;----------------------------------------------------------------------------- %macro SAD_X 3 cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2 SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 %endrep SAD_X%1_END %endmacro INIT_MMX SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 SAD_X 3, 4, 16 SAD_X 3, 4, 8 SAD_X 3, 4, 4 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 SAD_X 4, 4, 16 SAD_X 4, 4, 8 SAD_X 4, 4, 4 ;============================================================================= ; SAD x3/x4 XMM ;============================================================================= %macro SAD_X3_START_1x16P_SSE2 0 mova m2, [r0] %if cpuflag(avx) psadbw m0, m2, [r1] psadbw m1, m2, [r2] psadbw m2, [r3] %else movu m0, [r1] movu m1, [r2] movu m3, [r3] psadbw m0, m2 psadbw m1, m2 psadbw m2, m3 %endif %endmacro %macro SAD_X3_1x16P_SSE2 2 mova m3, [r0+%1] %if cpuflag(avx) psadbw m4, m3, [r1+%2] psadbw m5, m3, [r2+%2] psadbw m3, [r3+%2] %else movu m4, [r1+%2] movu m5, [r2+%2] movu m6, [r3+%2] psadbw m4, m3 psadbw m5, m3 psadbw m3, m6 %endif paddd m0, m4 paddd m1, m5 paddd m2, m3 %endmacro %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 5 %endif %macro SAD_X3_4x16P_SSE2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_1x16P_SSE2 %else SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 %endif SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X3_START_2x8P_SSE2 0 movq m3, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movhps m3, [r0+FENC_STRIDE] movhps m0, [r1+r4] movhps m1, [r2+r4] movhps m2, [r3+r4] psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 %endmacro %macro SAD_X3_2x8P_SSE2 4 movq m6, [r0+%1] movq m3, [r1+%2] movq m4, [r2+%2] movq m5, [r3+%2] movhps m6, [r0+%3] movhps m3, [r1+%4] movhps m4, [r2+%4] movhps m5, [r3+%4] psadbw m3, m6 psadbw m4, m6 psadbw m5, m6 paddd m0, m3 paddd m1, m4 paddd m2, m5 %endmacro %macro SAD_X4_START_2x8P_SSE2 0 movq m4, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movq m3, [r4] movhps m4, [r0+FENC_STRIDE] movhps m0, [r1+r5] movhps m1, [r2+r5] movhps m2, [r3+r5] movhps m3, [r4+r5] psadbw m0, m4 psadbw m1, m4 psadbw m2, m4 psadbw m3, m4 %endmacro %macro SAD_X4_2x8P_SSE2 4 movq m6, [r0+%1] movq m4, [r1+%2] movq m5, [r2+%2] movhps m6, [r0+%3] movhps m4, [r1+%4] movhps m5, [r2+%4] psadbw m4, m6 psadbw m5, m6 paddd m0, m4 paddd m1, m5 movq m4, [r3+%2] movq m5, [r4+%2] movhps m4, [r3+%4] movhps m5, [r4+%4] psadbw m4, m6 psadbw m5, m6 paddd m2, m4 paddd m3, m5 %endmacro %macro SAD_X4_START_1x16P_SSE2 0 mova m3, [r0] %if cpuflag(avx) psadbw m0, m3, [r1] psadbw m1, m3, [r2] psadbw m2, m3, [r3] psadbw m3, [r4] %else movu m0, [r1] movu m1, [r2] movu m2, [r3] movu m4, [r4] psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 psadbw m3, m4 %endif %endmacro %macro SAD_X4_1x16P_SSE2 2 mova m6, [r0+%1] %if cpuflag(avx) psadbw m4, m6, [r1+%2] psadbw m5, m6, [r2+%2] %else movu m4, [r1+%2] movu m5, [r2+%2] psadbw m4, m6 psadbw m5, m6 %endif paddd m0, m4 paddd m1, m5 %if cpuflag(avx) psadbw m4, m6, [r3+%2] psadbw m5, m6, [r4+%2] %else movu m4, [r3+%2] movu m5, [r4+%2] psadbw m4, m6 psadbw m5, m6 %endif paddd m2, m4 paddd m3, m5 %endmacro %macro SAD_X4_4x16P_SSE2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_1x16P_SSE2 %else SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 %endif SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 
SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2 SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_4x8P_SSE2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_2x8P_SSE2 %else SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 %endif SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X4_4x8P_SSE2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x8P_SSE2 %else SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_END_SSE2 1 movifnidn r5, r5mp movhlps m3, m0 movhlps m4, m1 movhlps m5, m2 paddd m0, m3 paddd m1, m4 paddd m2, m5 movd [r5+0], m0 movd [r5+4], m1 movd [r5+8], m2 RET %endmacro %macro SAD_X4_END_SSE2 1 mov r0, r6mp psllq m1, 32 psllq m3, 32 paddd m0, m1 paddd m2, m3 movhlps m1, m0 movhlps m3, m2 paddd m0, m1 paddd m2, m3 movq [r0+0], m0 movq [r0+8], m2 RET %endmacro %macro SAD_X3_START_2x16P_AVX2 0 movu m3, [r0] ; assumes FENC_STRIDE == 16 movu xm0, [r1] movu xm1, [r2] movu xm2, [r3] vinserti128 m0, m0, [r1+r4], 1 vinserti128 m1, m1, [r2+r4], 1 vinserti128 m2, m2, [r3+r4], 1 psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 %endmacro %macro SAD_X3_2x16P_AVX2 3 movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 movu xm4, [r1+%2] movu xm5, [r2+%2] movu xm6, [r3+%2] vinserti128 m4, m4, [r1+%3], 1 vinserti128 m5, m5, [r2+%3], 1 vinserti128 m6, m6, [r3+%3], 1 psadbw m4, m3 psadbw m5, m3 psadbw m6, m3 paddw m0, m4 paddw m1, m5 paddw m2, m6 %endmacro %macro SAD_X3_4x16P_AVX2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_2x16P_AVX2 %else SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 %endif SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X4_START_2x16P_AVX2 0 vbroadcasti128 m4, [r0] vbroadcasti128 m5, [r0+FENC_STRIDE] movu xm0, [r1] movu xm1, [r2] movu xm2, [r1+r5] movu xm3, [r2+r5] vinserti128 m0, m0, [r3], 1 vinserti128 m1, m1, [r4], 1 vinserti128 m2, m2, [r3+r5], 1 vinserti128 m3, m3, [r4+r5], 1 psadbw m0, m4 psadbw m1, m4 psadbw m2, m5 psadbw m3, m5 paddw m0, m2 paddw m1, m3 %endmacro %macro SAD_X4_2x16P_AVX2 4 vbroadcasti128 m6, [r0+%1] vbroadcasti128 m7, [r0+%3] movu xm2, [r1+%2] movu xm3, [r2+%2] movu xm4, [r1+%4] movu xm5, [r2+%4] vinserti128 m2, m2, [r3+%2], 1 vinserti128 m3, m3, [r4+%2], 1 vinserti128 m4, m4, [r3+%4], 1 vinserti128 m5, m5, [r4+%4], 1 psadbw m2, m6 psadbw m3, m6 psadbw m4, m7 psadbw m5, m7 paddd m0, m2 paddd m1, m3 paddd m0, m4 paddd m1, m5 %endmacro %macro SAD_X4_4x16P_AVX2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x16P_AVX2 %else SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X4_START_2x32P_AVX2 0 mova m4, [r0] movu m0, [r1] movu m2, [r2] movu m1, [r3] 
movu m3, [r4] psadbw m0, m4 psadbw m2, m4 psadbw m1, m4 psadbw m3, m4 packusdw m0, m2 packusdw m1, m3 mova m6, [r0+FENC_STRIDE] movu m2, [r1+r5] movu m4, [r2+r5] movu m3, [r3+r5] movu m5, [r4+r5] psadbw m2, m6 psadbw m4, m6 psadbw m3, m6 psadbw m5, m6 packusdw m2, m4 packusdw m3, m5 paddd m0, m2 paddd m1, m3 %endmacro %macro SAD_X4_2x32P_AVX2 4 mova m6, [r0+%1] movu m2, [r1+%2] movu m4, [r2+%2] movu m3, [r3+%2] movu m5, [r4+%2] psadbw m2, m6 psadbw m4, m6 psadbw m3, m6 psadbw m5, m6 packusdw m2, m4 packusdw m3, m5 paddd m0, m2 paddd m1, m3 mova m6, [r0+%3] movu m2, [r1+%4] movu m4, [r2+%4] movu m3, [r3+%4] movu m5, [r4+%4] psadbw m2, m6 psadbw m4, m6 psadbw m3, m6 psadbw m5, m6 packusdw m2, m4 packusdw m3, m5 paddd m0, m2 paddd m1, m3 %endmacro %macro SAD_X4_4x32P_AVX2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x32P_AVX2 %else SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_END_AVX2 0 movifnidn r5, r5mp packssdw m0, m1 ; 0 0 1 1 0 0 1 1 packssdw m2, m2 ; 2 2 _ _ 2 2 _ _ phaddd m0, m2 ; 0 1 2 _ 0 1 2 _ vextracti128 xm1, m0, 1 paddd xm0, xm1 ; 0 1 2 _ mova [r5], xm0 RET %endmacro %macro SAD_X4_END_AVX2 0 mov r0, r6mp pshufd m0, m0, 0x8 pshufd m1, m1, 0x8 vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 punpcklqdq xm0, xm1 punpcklqdq xm2, xm3 phaddd xm0, xm2 ; 0 1 2 3 mova [r0], xm0 RET %endmacro %macro SAD_X4_32P_END_AVX2 0 mov r0, r6mp vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 paddd xm0, xm2 paddd xm1, xm3 phaddd xm0, xm1 mova [r0], xm0 RET %endmacro ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X_SSE2 4 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_SSE2 x, %3/4 %assign x x+1 %endrep %if %3 == 64 SAD_X%1_END_SSE2 1 %else SAD_X%1_END_SSE2 0 %endif %endmacro %macro SAD_X3_W12 0 cglobal pixel_sad_x3_12x16, 5, 7, 8 mova m4, [MSK] pxor m0, m0 pxor m1, m1 pxor m2, m2 SAD_X3_12x4 SAD_X3_12x4 SAD_X3_12x4 SAD_X3_12x4 SAD_X3_END_SSE2 1 %endmacro %macro SAD_X4_W12 0 cglobal pixel_sad_x4_12x16, 6, 8, 8 mova m6, [MSK] pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 SAD_X4_12x4 SAD_X4_12x4 SAD_X4_12x4 SAD_X4_12x4 SAD_X4_END_SSE2 1 %endmacro %macro SAD_X3_W24 0 cglobal pixel_sad_x3_24x32, 5, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 32 .loop: SAD_X3_24x4 SAD_X3_24x4 SAD_X3_24x4 SAD_X3_24x4 sub r6, 16 cmp r6, 0 jnz .loop SAD_X3_END_SSE2 1 %endmacro %macro SAD_X4_W24 0 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_24x32, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 32 .loop: SAD_X4_24x4 SAD_X4_24x4 SAD_X4_24x4 SAD_X4_24x4 sub count, 16 jnz .loop SAD_X4_END_SSE2 1 %endmacro %macro SAD_X3_W32 0 cglobal pixel_sad_x3_32x8, 5, 6, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_32x16, 5, 6, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_32x24, 5, 6, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 SAD_X3_32x4 SAD_X3_32x4 
SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_32x32, 5, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 32 .loop: SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 sub r6, 16 cmp r6, 0 jnz .loop SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_32x64, 5, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 64 .loop1: SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 SAD_X3_32x4 sub r6, 16 cmp r6, 0 jnz .loop1 SAD_X3_END_SSE2 1 %endmacro %macro SAD_X4_W32 0 cglobal pixel_sad_x4_32x8, 6, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_END_SSE2 1 cglobal pixel_sad_x4_32x16, 6, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_END_SSE2 1 cglobal pixel_sad_x4_32x24, 6, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_END_SSE2 1 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_32x32, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 32 .loop: SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 sub count, 16 jnz .loop SAD_X4_END_SSE2 1 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_32x64, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 64 .loop: SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 SAD_X4_32x4 sub count, 16 jnz .loop SAD_X4_END_SSE2 1 %endmacro %macro SAD_X3_W48 0 cglobal pixel_sad_x3_48x64, 5, 7, 8 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 64 .loop: SAD_X3_48x4 SAD_X3_48x4 SAD_X3_48x4 SAD_X3_48x4 sub r6, 16 jnz .loop SAD_X3_END_SSE2 1 %endmacro %macro SAD_X4_W48 0 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_48x64, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 64 .loop: SAD_X4_48x4 SAD_X4_48x4 SAD_X4_48x4 SAD_X4_48x4 sub count, 16 jnz .loop SAD_X4_END_SSE2 1 %endmacro %macro SAD_X3_W64 0 cglobal pixel_sad_x3_64x16, 5, 7, 7 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 16 .loop: SAD_X3_64x4 SAD_X3_64x4 sub r6, 8 jnz .loop SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_64x32, 5, 7, 7 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 32 .loop: SAD_X3_64x4 SAD_X3_64x4 sub r6, 8 jnz .loop SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_64x48, 5, 7, 7 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 48 .loop: SAD_X3_64x4 SAD_X3_64x4 sub r6, 8 jnz .loop SAD_X3_END_SSE2 1 cglobal pixel_sad_x3_64x64, 5, 7, 7 pxor m0, m0 pxor m1, m1 pxor m2, m2 mov r6, 64 .loop: SAD_X3_64x4 SAD_X3_64x4 sub r6, 8 jnz .loop SAD_X3_END_SSE2 1 %endmacro %macro SAD_X4_W64 0 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_64x16, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 16 .loop: SAD_X4_64x4 SAD_X4_64x4 sub count, 8 jnz .loop SAD_X4_END_SSE2 1 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_64x32, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 32 .loop: SAD_X4_64x4 SAD_X4_64x4 sub count, 8 jnz .loop SAD_X4_END_SSE2 1 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_64x48, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 
48 .loop: SAD_X4_64x4 SAD_X4_64x4 sub count, 8 jnz .loop SAD_X4_END_SSE2 1 %if ARCH_X86_64 == 1 cglobal pixel_sad_x4_64x64, 6, 8, 8 %define count r7 %else cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4 %define count dword [rsp] %endif pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 mov count, 64 .loop: SAD_X4_64x4 SAD_X4_64x4 sub count, 8 jnz .loop SAD_X4_END_SSE2 1 %endmacro %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 %macro SAD_X4_64x8_AVX2 0 movu m4, [r0] movu m5, [r1] movu m6, [r2] movu m7, [r3] movu m8, [r4] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + mmsize] movu m5, [r1 + mmsize] movu m6, [r2 + mmsize] movu m7, [r3 + mmsize] movu m8, [r4 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE] movu m5, [r1 + r5] movu m6, [r2 + r5] movu m7, [r3 + r5] movu m8, [r4 + r5] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE + mmsize] movu m5, [r1 + r5 + mmsize] movu m6, [r2 + r5 + mmsize] movu m7, [r3 + r5 + mmsize] movu m8, [r4 + r5 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2] movu m5, [r1 + r5 * 2] movu m6, [r2 + r5 * 2] movu m7, [r3 + r5 * 2] movu m8, [r4 + r5 * 2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2 + mmsize] movu m5, [r1 + r5 * 2 + mmsize] movu m6, [r2 + r5 * 2 + mmsize] movu m7, [r3 + r5 * 2 + mmsize] movu m8, [r4 + r5 * 2 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3] movu m5, [r1 + r7] movu m6, [r2 + r7] movu m7, [r3 + r7] movu m8, [r4 + r7] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3 + mmsize] movu m5, [r1 + r7 + mmsize] movu m6, [r2 + r7 + mmsize] movu m7, [r3 + r7 + mmsize] movu m8, [r4 + r7 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] movu m4, [r0] movu m5, [r1] movu m6, [r2] movu m7, [r3] movu m8, [r4] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + mmsize] movu m5, [r1 + mmsize] movu m6, [r2 + mmsize] movu m7, [r3 + mmsize] movu m8, [r4 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE] movu m5, [r1 + r5] movu m6, [r2 + r5] movu m7, [r3 + r5] movu m8, [r4 + r5] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE + mmsize] movu m5, [r1 + r5 + mmsize] movu m6, [r2 + r5 + mmsize] movu m7, [r3 + r5 + mmsize] movu m8, [r4 + r5 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2] movu m5, [r1 + r5 * 2] movu m6, [r2 + r5 * 2] movu m7, [r3 + r5 * 2] movu m8, [r4 + 
r5 * 2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2 + mmsize] movu m5, [r1 + r5 * 2 + mmsize] movu m6, [r2 + r5 * 2 + mmsize] movu m7, [r3 + r5 * 2 + mmsize] movu m8, [r4 + r5 * 2 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3] movu m5, [r1 + r7] movu m6, [r2 + r7] movu m7, [r3 + r7] movu m8, [r4 + r7] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3 + mmsize] movu m5, [r1 + r7 + mmsize] movu m6, [r2 + r7 + mmsize] movu m7, [r3 + r7 + mmsize] movu m8, [r4 + r7 + mmsize] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 %endmacro %macro PIXEL_SAD_X4_END_AVX2 0 vextracti128 xm4, m0, 1 vextracti128 xm5, m1, 1 vextracti128 xm6, m2, 1 vextracti128 xm7, m3, 1 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 pshufd xm4, xm0, 2 pshufd xm5, xm1, 2 pshufd xm6, xm2, 2 pshufd xm7, xm3, 2 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 movd [r6 + 0], xm0 movd [r6 + 4], xm1 movd [r6 + 8], xm2 movd [r6 + 12], xm3 %endmacro cglobal pixel_sad_x4_64x16, 7,8,10 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 lea r7, [r5 * 3] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET cglobal pixel_sad_x4_64x32, 7,8,10 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 lea r7, [r5 * 3] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET cglobal pixel_sad_x4_64x48, 7,8,10 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 lea r7, [r5 * 3] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET cglobal pixel_sad_x4_64x64, 7,8,10 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 lea r7, [r5 * 3] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, 
FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_64x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET %macro SAD_X4_48x8_AVX2 0 movu m4, [r0] movu m5, [r1] movu m6, [r2] movu m7, [r3] movu m8, [r4] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu xm4, [r0 + mmsize] movu xm5, [r1 + mmsize] movu xm6, [r2 + mmsize] movu xm7, [r3 + mmsize] movu xm8, [r4 + mmsize] vinserti128 m4, m4, [r0 + FENC_STRIDE], 1 vinserti128 m5, m5, [r1 + r5], 1 vinserti128 m6, m6, [r2 + r5], 1 vinserti128 m7, m7, [r3 + r5], 1 vinserti128 m8, m8, [r4 + r5], 1 psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE + mmsize/2] movu m5, [r1 + r5 + mmsize/2] movu m6, [r2 + r5 + mmsize/2] movu m7, [r3 + r5 + mmsize/2] movu m8, [r4 + r5 + mmsize/2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2] movu m5, [r1 + r5 * 2] movu m6, [r2 + r5 * 2] movu m7, [r3 + r5 * 2] movu m8, [r4 + r5 * 2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu xm4, [r0 + FENC_STRIDE * 2 + mmsize] movu xm5, [r1 + r5 * 2 + mmsize] movu xm6, [r2 + r5 * 2 + mmsize] movu xm7, [r3 + r5 * 2 + mmsize] movu xm8, [r4 + r5 * 2 + mmsize] vinserti128 m4, m4, [r0 + FENC_STRIDE * 3], 1 vinserti128 m5, m5, [r1 + r7], 1 vinserti128 m6, m6, [r2 + r7], 1 vinserti128 m7, m7, [r3 + r7], 1 vinserti128 m8, m8, [r4 + r7], 1 psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3 + mmsize/2] movu m5, [r1 + r7 + mmsize/2] movu m6, [r2 + r7 + mmsize/2] movu m7, [r3 + r7 + mmsize/2] movu m8, [r4 + r7 + mmsize/2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] movu m4, [r0] movu m5, [r1] movu m6, [r2] movu m7, [r3] movu m8, [r4] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu xm4, [r0 + mmsize] movu xm5, [r1 + mmsize] movu xm6, [r2 + mmsize] movu xm7, [r3 + mmsize] movu xm8, [r4 + mmsize] vinserti128 m4, m4, [r0 + FENC_STRIDE], 1 vinserti128 m5, m5, [r1 + r5], 1 vinserti128 m6, m6, [r2 + r5], 1 vinserti128 m7, m7, [r3 + r5], 1 vinserti128 m8, m8, [r4 + r5], 1 psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE + mmsize/2] movu m5, [r1 + r5 + mmsize/2] movu m6, [r2 + r5 + mmsize/2] movu m7, [r3 + r5 + mmsize/2] movu m8, [r4 + r5 + mmsize/2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 2] movu m5, [r1 + r5 * 2] movu m6, [r2 + r5 * 2] movu m7, [r3 + r5 * 2] movu m8, [r4 + r5 * 2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 
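    ; (48-pixel rows, continued: the 16-byte tail of the current row is loaded
    ;  into an xmm register and the first 16 bytes of the next row are inserted
    ;  into the upper lane with vinserti128, so one 256-bit psadbw covers both
    ;  partial rows at once)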
movu xm4, [r0 + FENC_STRIDE * 2 + mmsize] movu xm5, [r1 + r5 * 2 + mmsize] movu xm6, [r2 + r5 * 2 + mmsize] movu xm7, [r3 + r5 * 2 + mmsize] movu xm8, [r4 + r5 * 2 + mmsize] vinserti128 m4, m4, [r0 + FENC_STRIDE * 3], 1 vinserti128 m5, m5, [r1 + r7], 1 vinserti128 m6, m6, [r2 + r7], 1 vinserti128 m7, m7, [r3 + r7], 1 vinserti128 m8, m8, [r4 + r7], 1 psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 movu m4, [r0 + FENC_STRIDE * 3 + mmsize/2] movu m5, [r1 + r7 + mmsize/2] movu m6, [r2 + r7 + mmsize/2] movu m7, [r3 + r7 + mmsize/2] movu m8, [r4 + r7 + mmsize/2] psadbw m9, m4, m5 paddd m0, m9 psadbw m5, m4, m6 paddd m1, m5 psadbw m6, m4, m7 paddd m2, m6 psadbw m4, m8 paddd m3, m4 %endmacro INIT_YMM avx2 cglobal pixel_sad_x4_48x64, 7,8,10 pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 lea r7, [r5 * 3] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r5 * 4] lea r2, [r2 + r5 * 4] lea r3, [r3 + r5 * 4] lea r4, [r4 + r5 * 4] SAD_X4_48x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET %endif INIT_XMM sse2 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 3, 8, 16, 7 SAD_X_SSE2 3, 8, 8, 7 SAD_X_SSE2 3, 8, 4, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSE2 4, 8, 16, 7 SAD_X_SSE2 4, 8, 8, 7 SAD_X_SSE2 4, 8, 4, 7 INIT_XMM sse3 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 3, 16, 4, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSE2 4, 16, 4, 7 INIT_XMM ssse3 SAD_X3_W12 SAD_X3_W32 SAD_X3_W24 SAD_X3_W48 SAD_X3_W64 SAD_X_SSE2 3, 16, 64, 7 SAD_X_SSE2 3, 16, 32, 7 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 12, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 3, 8, 32, 7 SAD_X_SSE2 3, 8, 16, 7 SAD_X4_W12 SAD_X4_W24 SAD_X4_W32 SAD_X4_W48 SAD_X4_W64 SAD_X_SSE2 4, 16, 64, 7 SAD_X_SSE2 4, 16, 32, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 12, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSE2 4, 8, 32, 7 SAD_X_SSE2 4, 8, 16, 7 SAD_X_SSE2 4, 8, 8, 7 SAD_X_SSE2 4, 8, 4, 7 INIT_XMM avx SAD_X3_W12 SAD_X3_W32 SAD_X3_W24 SAD_X3_W48 SAD_X3_W64 SAD_X_SSE2 3, 16, 64, 7 SAD_X_SSE2 3, 16, 32, 6 SAD_X_SSE2 3, 16, 16, 6 SAD_X_SSE2 3, 16, 12, 6 SAD_X_SSE2 3, 16, 8, 6 SAD_X_SSE2 3, 16, 4, 6 SAD_X4_W12 SAD_X4_W24 SAD_X4_W32 SAD_X4_W48 SAD_X4_W64 SAD_X_SSE2 4, 16, 64, 7 SAD_X_SSE2 4, 16, 32, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 12, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSE2 4, 16, 4, 7 %macro SAD_X_AVX2 4 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_AVX2 x, %3/4 %assign x x+1 %endrep %if (%1==4) && (%2==32) SAD_X%1_32P_END_AVX2 %else SAD_X%1_END_AVX2 %endif %endmacro INIT_YMM avx2 SAD_X_AVX2 3, 16, 32, 7 SAD_X_AVX2 3, 16, 16, 7 SAD_X_AVX2 3, 16, 12, 7 SAD_X_AVX2 3, 16, 8, 7 SAD_X_AVX2 4, 16, 32, 8 SAD_X_AVX2 4, 16, 16, 8 SAD_X_AVX2 4, 16, 12, 8 SAD_X_AVX2 4, 16, 8, 
8 SAD_X_AVX2 4, 32, 8, 8
SAD_X_AVX2 4, 32, 16, 8
SAD_X_AVX2 4, 32, 24, 8
SAD_X_AVX2 4, 32, 32, 8
SAD_X_AVX2 4, 32, 64, 8

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)

; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
ALIGN 16
sad_w16_align%1_sse2:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    movdqa xmm3, [r2]
    movdqa xmm4, [r2+r3]
    pslldq xmm1, 16-%1
    pslldq xmm2, 16-%1
    psrldq xmm3, %1
    psrldq xmm4, %1
    por xmm1, xmm3
    por xmm2, xmm4
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_sse2
    ret
%endmacro

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
ALIGN 16
sad_w16_align%1_ssse3:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_ssse3
    ret
%endmacro

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    mov eax, r2m
    and eax, 0x37
    cmp eax, 0x30
    jle pixel_sad_16x%2_sse2
    PROLOGUE 4,6
    mov r4d, r2d
    and r4d, 15
%ifidn %1, ssse3
    shl r4d, 6 ; code size = 64
%else
    lea r4, [r4*5]
    shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
    lea r5, [sad_w16_addr]
    add r5, r4
%else
    lea r5, [sad_w16_addr + r4]
%endif
    and r2, ~15
    mov r4d, %2/2
    pxor xmm0, xmm0
    call r5
    movhlps xmm1, xmm0
    paddw xmm0, xmm1
    movd eax, xmm0
    RET
%endmacro

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    mov eax, r2m
    and eax, 0x17|%1|(%4>>1)
    cmp eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmx2
    and eax, 7
    shl eax, 3
    movd mm6, [pd_64]
    movd mm7, eax
    psubw mm6, mm7
    PROLOGUE 4,5
    and r2, ~7
    mov r4d, %3
    pxor mm0, mm0
%endmacro

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
    movq mm1, [r2]
    movq mm2, [r2+8]
    movq mm3, [r2+16]
    movq mm4, mm2
    psrlq mm1, mm7
    psllq mm2, mm6
    psllq mm3, mm6
    psrlq mm4, mm7
    por mm1, mm2
    por mm3, mm4
    psadbw mm1, [r0]
    psadbw mm3, [r0+8]
    paddw mm0, mm1
    paddw mm0, mm3
    add r2, r3
    add r0, r1
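    ; (the three aligned 8-byte loads above are spliced together with the
    ;  psllq/psrlq shift counts prepared by SAD_CACHELINE_START_MMX2 in mm6/mm7,
    ;  reconstructing the two misaligned reference qwords without a split load)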
dec r4 jg .loop movd eax, mm0 RET %endmacro %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline cglobal pixel_sad_8x%1_cache%2_mmx2 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 .loop: movq mm1, [r2+8] movq mm2, [r2+r3+8] movq mm3, [r2] movq mm4, [r2+r3] psllq mm1, mm6 psllq mm2, mm6 psrlq mm3, mm7 psrlq mm4, mm7 por mm1, mm3 por mm2, mm4 psadbw mm1, [r0] psadbw mm2, [r0+r1] paddw mm0, mm1 paddw mm0, mm2 lea r2, [r2+2*r3] lea r0, [r0+2*r1] dec r4 jg .loop movd eax, mm0 RET %endmacro ; sad_x3/x4_cache64: check each mv. ; if they're all within a cacheline, use normal sad_x3/x4. ; otherwise, send them individually to sad_cache64. %macro CHECK_SPLIT 3 ; pix, width, cacheline mov eax, %1 and eax, 0x17|%2|(%3>>1) cmp eax, 0x10|%2|(%3>>1) jg .split %endmacro %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name cglobal pixel_sad_x3_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 jmp pixel_sad_x3_%1x%2_%4 .split: %if ARCH_X86_64 PROLOGUE 6,9 push r3 push r2 %if WIN64 movsxd r4, r4d sub rsp, 40 ; shadow space and alignment %endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r4 mov r7, r0 mov r8, r5 call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 mov r2, [rsp+40+0*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 mov r2, [rsp+40+1*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 add rsp, 40+2*8 %endif RET %else push edi mov edi, [esp+28] push dword [esp+24] push dword [esp+16] push dword 16 push dword [esp+20] call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov [edi+8], eax add esp, 16 pop edi ret %endif %endmacro %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name cglobal pixel_sad_x4_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 CHECK_SPLIT r4m, %1, %3 jmp pixel_sad_x4_%1x%2_%4 .split: %if ARCH_X86_64 PROLOGUE 6,9 mov r8, r6mp push r4 push r3 push r2 %if WIN64 sub rsp, 32 ; shadow space %endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r5 mov r7, r0 call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 mov r2, [rsp+32+0*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 mov r2, [rsp+32+1*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 mov r2, [rsp+32+2*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+12], eax %if WIN64 add rsp, 32+3*8 %endif RET %else push edi mov edi, [esp+32] push dword [esp+28] push dword [esp+16] push dword 16 push dword [esp+20] call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+40] mov [edi+8], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov [edi+12], eax add esp, 16 pop edi ret %endif %endmacro %macro SADX34_CACHELINE_FUNC 1+ SADX3_CACHELINE_FUNC %1 SADX4_CACHELINE_FUNC %1 %endmacro ; instantiate the aligned sads INIT_MMX %if ARCH_X86_64 == 0 SAD16_CACHELINE_FUNC_MMX2 8, 32 SAD16_CACHELINE_FUNC_MMX2 16, 32 SAD8_CACHELINE_FUNC_MMX2 4, 32 SAD8_CACHELINE_FUNC_MMX2 8, 32 SAD8_CACHELINE_FUNC_MMX2 16, 32 SAD16_CACHELINE_FUNC_MMX2 8, 64 SAD16_CACHELINE_FUNC_MMX2 16, 64 %endif ; !ARCH_X86_64 SAD8_CACHELINE_FUNC_MMX2 4, 64 
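; (instantiation note: each pixel_sad_*_cacheNN entry first checks the low bits
;  of the reference pointer and tail-jumps to the ordinary SAD when the load
;  cannot straddle a cacheline; the cacheline-32 kernels and the 16-wide
;  cache64 MMX2 kernels above are built only for 32-bit targets, guarded by
;  ARCH_X86_64 == 0, while the 8-wide cache64 kernels here are built for both
;  32- and 64-bit builds.)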
SAD8_CACHELINE_FUNC_MMX2 8, 64 SAD8_CACHELINE_FUNC_MMX2 16, 64 %if ARCH_X86_64 == 0 SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2 %endif ; !ARCH_X86_64 SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2 %if ARCH_X86_64 == 0 SAD16_CACHELINE_FUNC sse2, 8 SAD16_CACHELINE_FUNC sse2, 16 %assign i 1 %rep 15 SAD16_CACHELINE_LOOP_SSE2 i %assign i i+1 %endrep SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2 %endif ; !ARCH_X86_64 SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2 SAD16_CACHELINE_FUNC ssse3, 8 SAD16_CACHELINE_FUNC ssse3, 16 %assign i 1 %rep 15 SAD16_CACHELINE_LOOP_SSSE3 i %assign i i+1 %endrep SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3 %if HIGH_BIT_DEPTH==0 INIT_YMM avx2 cglobal pixel_sad_x3_8x4, 6,6,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 ; row 0 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 1 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 2 vpbroadcastq xm2, [r0 + 2 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 3 vpbroadcastq xm2, [r0 + 3 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 pshufd xm0, xm0, q0020 movq [r5 + 0], xm0 movd [r5 + 8], xm1 RET INIT_YMM avx2 cglobal pixel_sad_x3_8x8, 6,6,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 %assign x 0 %rep 4 ; row 0 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 1 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 %assign x x+1 %if x < 4 add r1, r4 add r0, 2 * FENC_STRIDE %endif %endrep pshufd xm0, xm0, q0020 movq [r5 + 0], xm0 movd [r5 + 8], xm1 RET INIT_YMM avx2 cglobal pixel_sad_x3_8x16, 6,6,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 %assign x 0 %rep 8 ; row 0 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 1 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 %assign x x+1 %if x < 8 add r1, r4 add r0, 2 * FENC_STRIDE %endif %endrep pshufd xm0, xm0, q0020 movq [r5 + 0], xm0 movd [r5 + 8], xm1 RET %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 %macro SAD_X3_32x8_AVX2 0 movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m7, m3, m5 paddd m1, m7 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE] movu m4, [r1 + r4] movu m5, [r2 + r4] movu m6, [r3 + r4] psadbw m7, m3, m4 
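; (row 1 of SAD_X3_32x8_AVX2 continues below; every row follows the same
;  pattern: one 32-byte fenc row is psadbw'ed against the matching row of the
;  three candidate references r1/r2/r3, and the per-candidate totals are kept
;  in m0/m1/m2 until PIXEL_SAD_X3_END_AVX2 reduces and stores them.)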
paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3] movu m4, [r1 + r6] movu m5, [r2 + r6] movu m6, [r3 + r6] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE] movu m4, [r1 + r4] movu m5, [r2 + r4] movu m6, [r3 + r4] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3] movu m4, [r1 + r6] movu m5, [r2 + r6] movu m6, [r3 + r6] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 %endmacro %macro SAD_X3_64x8_AVX2 0 movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + mmsize] movu m4, [r1 + mmsize] movu m5, [r2 + mmsize] movu m6, [r3 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE] movu m4, [r1 + r4] movu m5, [r2 + r4] movu m6, [r3 + r4] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE + mmsize] movu m4, [r1 + r4 + mmsize] movu m5, [r2 + r4 + mmsize] movu m6, [r3 + r4 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2 + mmsize] movu m4, [r1 + r4 * 2 + mmsize] movu m5, [r2 + r4 * 2 + mmsize] movu m6, [r3 + r4 * 2 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3] movu m4, [r1 + r6] movu m5, [r2 + r6] movu m6, [r3 + r6] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3 + mmsize] movu m4, [r1 + r6 + mmsize] movu m5, [r2 + r6 + mmsize] movu m6, [r3 + r6 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + mmsize] movu m4, [r1 + mmsize] movu m5, [r2 + mmsize] movu m6, [r3 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE] movu m4, [r1 + r4] movu m5, [r2 + r4] movu m6, [r3 + r4] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE + mmsize] movu m4, [r1 + r4 + mmsize] movu m5, [r2 + r4 + mmsize] movu m6, [r3 + r4 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, 
m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2 + mmsize] movu m4, [r1 + r4 * 2 + mmsize] movu m5, [r2 + r4 * 2 + mmsize] movu m6, [r3 + r4 * 2 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3] movu m4, [r1 + r6] movu m5, [r2 + r6] movu m6, [r3 + r6] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3 + mmsize] movu m4, [r1 + r6 + mmsize] movu m5, [r2 + r6 + mmsize] movu m6, [r3 + r6 + mmsize] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 %endmacro %macro SAD_X3_48x8_AVX2 0 movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu xm3, [r0 + mmsize] movu xm4, [r1 + mmsize] movu xm5, [r2 + mmsize] movu xm6, [r3 + mmsize] vinserti128 m3, m3, [r0 + FENC_STRIDE], 1 vinserti128 m4, m4, [r1 + r4], 1 vinserti128 m5, m5, [r2 + r4], 1 vinserti128 m6, m6, [r3 + r4], 1 psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE + mmsize/2] movu m4, [r1 + r4 + mmsize/2] movu m5, [r2 + r4 + mmsize/2] movu m6, [r3 + r4 + mmsize/2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu xm3, [r0 + FENC_STRIDE * 2 + mmsize] movu xm4, [r1 + r4 * 2 + mmsize] movu xm5, [r2 + r4 * 2 + mmsize] movu xm6, [r3 + r4 * 2 + mmsize] vinserti128 m3, m3, [r0 + FENC_STRIDE * 3], 1 vinserti128 m4, m4, [r1 + r6], 1 vinserti128 m5, m5, [r2 + r6], 1 vinserti128 m6, m6, [r3 + r6], 1 psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3 + mmsize/2] movu m4, [r1 + r6 + mmsize/2] movu m5, [r2 + r6 + mmsize/2] movu m6, [r3 + r6 + mmsize/2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] movu m3, [r0] movu m4, [r1] movu m5, [r2] movu m6, [r3] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu xm3, [r0 + mmsize] movu xm4, [r1 + mmsize] movu xm5, [r2 + mmsize] movu xm6, [r3 + mmsize] vinserti128 m3, m3, [r0 + FENC_STRIDE], 1 vinserti128 m4, m4, [r1 + r4], 1 vinserti128 m5, m5, [r2 + r4], 1 vinserti128 m6, m6, [r3 + r4], 1 psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE + mmsize/2] movu m4, [r1 + r4 + mmsize/2] movu m5, [r2 + r4 + mmsize/2] movu m6, [r3 + r4 + mmsize/2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 2] movu m4, [r1 + r4 * 2] movu m5, [r2 + r4 * 2] movu m6, [r3 + r4 * 2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu xm3, [r0 + FENC_STRIDE * 2 + mmsize] movu xm4, [r1 + r4 * 2 + mmsize] movu xm5, [r2 + r4 * 2 + mmsize] movu xm6, [r3 + r4 * 2 + mmsize] vinserti128 m3, m3, [r0 + FENC_STRIDE * 3], 1 vinserti128 m4, m4, [r1 + r6], 1 vinserti128 m5, m5, [r2 + 
r6], 1 vinserti128 m6, m6, [r3 + r6], 1 psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 movu m3, [r0 + FENC_STRIDE * 3 + mmsize/2] movu m4, [r1 + r6 + mmsize/2] movu m5, [r2 + r6 + mmsize/2] movu m6, [r3 + r6 + mmsize/2] psadbw m7, m3, m4 paddd m0, m7 psadbw m4, m3, m5 paddd m1, m4 psadbw m3, m6 paddd m2, m3 %endmacro %macro PIXEL_SAD_X3_END_AVX2 0 vextracti128 xm3, m0, 1 vextracti128 xm4, m1, 1 vextracti128 xm5, m2, 1 paddd m0, m3 paddd m1, m4 paddd m2, m5 pshufd xm3, xm0, 2 pshufd xm4, xm1, 2 pshufd xm5, xm2, 2 paddd m0, m3 paddd m1, m4 paddd m2, m5 movd [r5 + 0], xm0 movd [r5 + 4], xm1 movd [r5 + 8], xm2 %endmacro cglobal pixel_sad_x3_32x8, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_32x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_32x16, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_32x24, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_32x32, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_32x64, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_32x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_64x16, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_64x32, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_64x48, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, 
[r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_64x64, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_64x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET cglobal pixel_sad_x3_48x64, 6,7,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 lea r6, [r4 * 3] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 add r0, FENC_STRIDE * 4 lea r1, [r1 + r4 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r4 * 4] SAD_X3_48x8_AVX2 PIXEL_SAD_X3_END_AVX2 RET %endif INIT_YMM avx2 cglobal pixel_sad_x4_8x8, 7,7,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 sub r4, r1 %assign x 0 %rep 4 ; row 0 vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] movhps xm4, [r1 + r4] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r5 ; row 1 vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] movhps xm4, [r1 + r4] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 %assign x x+1 %if x < 4 add r1, r5 add r0, 2 * FENC_STRIDE %endif %endrep pshufd xm0, xm0, q0020 pshufd xm1, xm1, q0020 movq [r6 + 0], xm0 movq [r6 + 8], xm1 RET INIT_YMM avx2 cglobal pixel_sad_32x8, 4,4,6 xorps m0, m0 xorps m5, m5 movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r0] ; row 2 of pix0 movu m2, [r2] ; row 2 of pix1 movu m3, [r0 + r1] ; row 3 of pix0 movu m4, [r2 + r3] ; row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r0] ; row 4 of pix0 movu m2, [r2] ; row 4 of 
pix1 movu m3, [r0 + r1] ; row 5 of pix0 movu m4, [r2 + r3] ; row 5 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r0] ; row 6 of pix0 movu m2, [r2] ; row 6 of pix1 movu m3, [r0 + r1] ; row 7 of pix0 movu m4, [r2 + r3] ; row 7 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_32x16, 4,5,6 xorps m0, m0 xorps m5, m5 mov r4d, 4 .loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r0] ; row 2 of pix0 movu m2, [r2] ; row 2 of pix1 movu m3, [r0 + r1] ; row 3 of pix0 movu m4, [r2 + r3] ; row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_32x24, 4,7,6 xorps m0, m0 xorps m5, m5 mov r4d, 6 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + 2 * r1] ; row 2 of pix0 movu m2, [r2 + 2 * r3] ; row 2 of pix1 movu m3, [r0 + r5] ; row 3 of pix0 movu m4, [r2 + r6] ; row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_32x32, 4,7,5 xorps m0, m0 mov r4d, 32/4 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [r0 + 2 * r1] ; row 2 of pix0 movu m2, [r2 + 2 * r3] ; row 2 of pix1 movu m3, [r0 + r5] ; row 3 of pix0 movu m4, [r2 + r6] ; row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_32x64, 4,7,5 xorps m0, m0 mov r4d, 64/8 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [r0 + 2 * r1] ; row 2 of pix0 movu m2, [r2 + 2 * r3] ; row 2 of pix1 movu m3, [r0 + r5] ; row 3 of pix0 movu m4, [r2 + r6] ; row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] movu m1, [r0] ; row 4 of pix0 movu m2, [r2] ; row 4 of pix1 movu m3, [r0 + r1] ; row 5 of pix0 movu m4, [r2 + r3] ; row 5 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [r0 + 2 * r1] ; row 6 of pix0 movu m2, [r2 + 2 * r3] ; row 6 of pix1 movu m3, [r0 + r5] ; row 7 of pix0 movu m4, [r2 + r6] ; row 7 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m0, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal 
pixel_sad_48x64, 4,7,7 xorps m0, m0 mov r4d, 64/4 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 movu m4, [r2 + r3] ; row 1 of pix1 movu xm5, [r0 +32] ; last 16 of row 0 of pix0 vinserti128 m5, m5, [r0 + r1 + 32], 1 movu xm6, [r2 +32] ; last 16 of row 0 of pix1 vinserti128 m6, m6, [r2 + r3 + 32], 1 psadbw m1, m2 psadbw m3, m4 psadbw m5, m6 paddd m0, m1 paddd m0, m3 paddd m0, m5 movu m1, [r0 + 2 * r1] ; row 2 of pix0 movu m2, [r2 + 2 * r3] ; row 2 of pix1 movu m3, [r0 + r5] ; row 3 of pix0 movu m4, [r2 + r6] ; row 3 of pix1 movu xm5, [r0 +32 + 2 * r1] vinserti128 m5, m5, [r0 + r5 + 32], 1 movu xm6, [r2 +32 + 2 * r3] vinserti128 m6, m6, [r2 + r6 + 32], 1 psadbw m1, m2 psadbw m3, m4 psadbw m5, m6 paddd m0, m1 paddd m0, m3 paddd m0, m5 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_64x16, 4,5,6 xorps m0, m0 xorps m5, m5 mov r4d, 4 .loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movu m1, [r0] ; first 32 of row 2 of pix0 movu m2, [r2] ; first 32 of row 2 of pix1 movu m3, [r0 + 32] ; second 32 of row 2 of pix0 movu m4, [r2 + 32] ; second 32 of row 2 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 3 of pix0 movu m2, [r2 + r3] ; first 32 of row 3 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 3 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_64x32, 4,5,6 xorps m0, m0 xorps m5, m5 mov r4d, 16 .loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_64x48, 4,7,6 xorps m0, m0 xorps m5, m5 mov r4d, 12 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of 
pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + 2 * r1] ; first 32 of row 0 of pix0 movu m2, [r2 + 2 * r3] ; first 32 of row 0 of pix1 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 0 of pix0 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 0 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r5] ; first 32 of row 1 of pix0 movu m2, [r2 + r6] ; first 32 of row 1 of pix1 movu m3, [r0 + 32 + r5] ; second 32 of row 1 of pix0 movu m4, [r2 + 32 + r6] ; second 32 of row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_sad_64x64, 4,7,6 xorps m0, m0 xorps m5, m5 mov r4d, 8 lea r5, [r1 * 3] lea r6, [r3 * 3] .loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 movu m4, [r2 + 32] ; second 32 of row 0 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 1 of pix0 movu m2, [r2 + r3] ; first 32 of row 1 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + 2 * r1] ; first 32 of row 2 of pix0 movu m2, [r2 + 2 * r3] ; first 32 of row 2 of pix1 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r5] ; first 32 of row 3 of pix0 movu m2, [r2 + r6] ; first 32 of row 3 of pix1 movu m3, [r0 + 32 + r5] ; second 32 of row 3 of pix0 movu m4, [r2 + 32 + r6] ; second 32 of row 3 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] movu m1, [r0] ; first 32 of row 4 of pix0 movu m2, [r2] ; first 32 of row 4 of pix1 movu m3, [r0 + 32] ; second 32 of row 4 of pix0 movu m4, [r2 + 32] ; second 32 of row 4 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r1] ; first 32 of row 5 of pix0 movu m2, [r2 + r3] ; first 32 of row 5 of pix1 movu m3, [r0 + 32 + r1] ; second 32 of row 5 of pix0 movu m4, [r2 + 32 + r3] ; second 32 of row 5 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + 2 * r1] ; first 32 of row 6 of pix0 movu m2, [r2 + 2 * r3] ; first 32 of row 6 of pix1 movu m3, [r0 + 2 * r1 + 32] ; second 32 of row 6 of pix0 movu m4, [r2 + 2 * r3 + 32] ; second 32 of row 6 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 movu m1, [r0 + r5] ; first 32 of row 7 of pix0 movu m2, [r2 + r6] ; first 32 of row 7 of pix1 movu m3, [r0 + 32 + r5] ; second 32 of row 7 of pix0 movu m4, [r2 + 32 + r6] ; second 32 of row 7 of pix1 psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r4d jnz .loop paddd m0, m5 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, 2 paddd xm0,xm1 movd eax, xm0 RET %endif %endif ; %if !HIGH_BIT_DEPTH xavs2-1.3/source/common/x86/sad-vpp.asm000066400000000000000000000234121340660520300177120ustar00rootroot00000000000000; ============================================================================ ; sad-vpp.asm ; - x86 sad functions ; ---------------------------------------------------------------------------- ; ; xavs2 encoder , the Chinese AVS2 video encoder library. 
; ; ============================================================================ %include "x86inc.asm" %include "x86util.asm" ; ---------------------------------------------------------------------------- ; all functions in this file are same as in sad-a.asm, except the ; stride value: VPP_STRIDE ; ; ---------------------------------------------------------------------------- ; functions defined in this file: ; vpp_sad_x3_8x8_xmm2 ; vpp_sad_x4_8x8_xmm2 ; ; vpp_sad_x3_8x8_sse2 ; vpp_sad_x4_8x8_sse2 ; ; vpp_sad_x4_8x8_ssse3 ; ; #if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 ; vpp_sad_x3_8x8_avx2 ; vpp_sad_x4_8x8_avx2 ; #endif ; ---------------------------------------------------------------------------- SECTION_RODATA 32 SECTION .text %assign VPP_STRIDE 8 ;============================================================================= ; SAD x3/x4 MMX ;============================================================================= %macro SAD_X3_START_1x8P 0 movq mm3, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] psadbw mm0, mm3 psadbw mm1, mm3 psadbw mm2, mm3 %endmacro %macro SAD_X3_1x8P 2 movq mm3, [r0+%1] movq mm4, [r1+%2] movq mm5, [r2+%2] movq mm6, [r3+%2] psadbw mm4, mm3 psadbw mm5, mm3 psadbw mm6, mm3 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 %endmacro %macro SAD_X3_2x8P 1 %if %1 SAD_X3_START_1x8P %else SAD_X3_1x8P 0, 0 %endif SAD_X3_1x8P VPP_STRIDE, r4 add r0, 2*VPP_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X4_START_1x8P 0 movq mm7, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] movq mm3, [r4] psadbw mm0, mm7 psadbw mm1, mm7 psadbw mm2, mm7 psadbw mm3, mm7 %endmacro %macro SAD_X4_1x8P 2 movq mm7, [r0+%1] movq mm4, [r1+%2] movq mm5, [r2+%2] movq mm6, [r3+%2] psadbw mm4, mm7 psadbw mm5, mm7 psadbw mm6, mm7 psadbw mm7, [r4+%2] paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 paddw mm3, mm7 %endmacro %macro SAD_X4_2x8P 1 %if %1 SAD_X4_START_1x8P %else SAD_X4_1x8P 0, 0 %endif SAD_X4_1x8P VPP_STRIDE, r5 add r0, 2*VPP_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X3_END 0 %if UNIX64 movd [r5+0], mm0 movd [r5+4], mm1 movd [r5+8], mm2 %else mov r0, r5mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 %endif RET %endmacro %macro SAD_X4_END 0 mov r0, r6mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 movd [r0+12], mm3 RET %endmacro ; ---------------------------------------------------------------------------- ; void vpp_sad_x3_8x8( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ; ---------------------------------------------------------------------------- %macro SAD_X 3 cglobal vpp_sad_x%1_%2x%3_mmx2, %1+2, %1+2 SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 %endrep SAD_X%1_END %endmacro INIT_MMX SAD_X 3, 8, 8 SAD_X 4, 8, 8 ; ============================================================================ ; SAD x3/x4 XMM ; ============================================================================ %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 5 %endif %macro SAD_X3_START_2x8P_SSE2 0 movq m3, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movhps m3, [r0+VPP_STRIDE] movhps m0, [r1+r4] movhps m1, [r2+r4] movhps m2, [r3+r4] psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 %endmacro %macro SAD_X3_2x8P_SSE2 4 movq m6, [r0+%1] movq m3, [r1+%2] movq m4, [r2+%2] movq m5, [r3+%2] movhps m6, [r0+%3] movhps m3, [r1+%4] movhps m4, [r2+%4] movhps m5, [r3+%4] psadbw m3, m6 psadbw m4, m6 psadbw m5, m6 paddd m0, m3 paddd m1, m4 paddd m2, m5 %endmacro %macro 
SAD_X4_START_2x8P_SSE2 0 movq m4, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movq m3, [r4] movhps m4, [r0+VPP_STRIDE] movhps m0, [r1+r5] movhps m1, [r2+r5] movhps m2, [r3+r5] movhps m3, [r4+r5] psadbw m0, m4 psadbw m1, m4 psadbw m2, m4 psadbw m3, m4 %endmacro %macro SAD_X4_2x8P_SSE2 4 movq m6, [r0+%1] movq m4, [r1+%2] movq m5, [r2+%2] movhps m6, [r0+%3] movhps m4, [r1+%4] movhps m5, [r2+%4] psadbw m4, m6 psadbw m5, m6 paddd m0, m4 paddd m1, m5 movq m4, [r3+%2] movq m5, [r4+%2] movhps m4, [r3+%4] movhps m5, [r4+%4] psadbw m4, m6 psadbw m5, m6 paddd m2, m4 paddd m3, m5 %endmacro %macro SAD_X3_4x8P_SSE2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_2x8P_SSE2 %else SAD_X3_2x8P_SSE2 VPP_STRIDE*(0+(%1&1)*4), r4*0, VPP_STRIDE*(1+(%1&1)*4), r4*1 %endif SAD_X3_2x8P_SSE2 VPP_STRIDE*(2+(%1&1)*4), r4*2, VPP_STRIDE*(3+(%1&1)*4), t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*VPP_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X4_4x8P_SSE2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x8P_SSE2 %else SAD_X4_2x8P_SSE2 VPP_STRIDE*(0+(%1&1)*4), r5*0, VPP_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x8P_SSE2 VPP_STRIDE*(2+(%1&1)*4), r5*2, VPP_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*VPP_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_END_SSE2 1 movifnidn r5, r5mp movhlps m3, m0 movhlps m4, m1 movhlps m5, m2 paddd m0, m3 paddd m1, m4 paddd m2, m5 movd [r5+0], m0 movd [r5+4], m1 movd [r5+8], m2 RET %endmacro %macro SAD_X4_END_SSE2 1 mov r0, r6mp psllq m1, 32 psllq m3, 32 paddd m0, m1 paddd m2, m3 movhlps m1, m0 movhlps m3, m2 paddd m0, m1 paddd m2, m3 movq [r0+0], m0 movq [r0+8], m2 RET %endmacro ; ---------------------------------------------------------------------------- ; void vpp_sad_x3_8x8( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ; ---------------------------------------------------------------------------- %macro SAD_X_SSE2 4 cglobal vpp_sad_x%1_%2x%3, 2+%1,3+%1,%4 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_SSE2 x, %3/4 %assign x x+1 %endrep %if %3 == 64 SAD_X%1_END_SSE2 1 %else SAD_X%1_END_SSE2 0 %endif %endmacro INIT_XMM sse2 SAD_X_SSE2 3, 8, 8, 7 SAD_X_SSE2 4, 8, 8, 7 INIT_XMM ssse3 SAD_X_SSE2 4, 8, 8, 7 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal vpp_sad_x3_8x8, 6,6,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 %assign x 0 %rep 4 ; row 0 vpbroadcastq xm2, [r0 + 0 * VPP_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r4 ; row 1 vpbroadcastq xm2, [r0 + 1 * VPP_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 %assign x x+1 %if x < 4 add r1, r4 add r0, 2 * VPP_STRIDE %endif %endrep pshufd xm0, xm0, q0020 movq [r5 + 0], xm0 movd [r5 + 8], xm1 RET INIT_YMM avx2 cglobal vpp_sad_x4_8x8, 7,7,5 xorps m0, m0 xorps m1, m1 sub r2, r1 ; rebase on pointer r1 sub r3, r1 sub r4, r1 %assign x 0 %rep 4 ; row 0 vpbroadcastq xm2, [r0 + 0 * VPP_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] movhps xm4, [r1 + r4] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 add r1, r5 ; row 1 vpbroadcastq xm2, [r0 + 1 * VPP_STRIDE] movq xm3, [r1] movhps xm3, [r1 + r2] movq xm4, [r1 + r3] movhps xm4, [r1 + r4] psadbw xm3, xm2 psadbw xm4, xm2 paddd xm0, xm3 paddd xm1, xm4 %assign x x+1 %if x < 4 add r1, r5 add r0, 2 * 
VPP_STRIDE %endif %endrep pshufd xm0, xm0, q0020 pshufd xm1, xm1, q0020 movq [r6 + 0], xm0 movq [r6 + 8], xm1 RET %endif xavs2-1.3/source/common/x86/satd-a.asm000066400000000000000000004077031340660520300175220ustar00rootroot00000000000000;***************************************************************************** ;* satd-a.asm: x86 satd functions ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 hmul_8p: times 8 db 1 times 4 db 1, -1 times 8 db 1 times 4 db 1, -1 hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 hmul_8w: times 4 dw 1 times 2 dw 1, -1 times 4 dw 1 times 2 dw 1, -1 ALIGN 32 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 SECTION .text cextern pb_0 cextern pb_1 cextern pw_1 cextern pw_8 cextern pw_16 cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz cextern pd_1 cextern pd_2 cextern hmul_16p cextern pb_movemask cextern pb_movemask_32 cextern pw_pixel_max ;============================================================================= ; SATD ;============================================================================= %macro JDUP 2 %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 %elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 movsldup %1, %1 %else ; doesn't need to dup. 
sse2 does things by zero extending to words and full h_2d punpckldq %1, %2 %endif %endmacro %macro HSUMSUB 5 pmaddubsw m%2, m%5 pmaddubsw m%1, m%5 pmaddubsw m%4, m%5 pmaddubsw m%3, m%5 %endmacro %macro DIFF_UNPACK_SSE2 5 punpcklbw m%1, m%5 punpcklbw m%2, m%5 punpcklbw m%3, m%5 punpcklbw m%4, m%5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro DIFF_SUMSUB_SSSE3 5 HSUMSUB %1, %2, %3, %4, %5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer movd %1, %3 movd %2, %4 JDUP %1, %2 %endmacro %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer movddup m%3, %6 movddup m%4, %8 movddup m%1, %5 movddup m%2, %7 %endmacro %macro LOAD_DUP_4x8P_PENRYN 8 ; penryn and nehalem run punpcklqdq and movddup in different units movh m%3, %6 movh m%4, %8 punpcklqdq m%3, m%3 movddup m%1, %5 punpcklqdq m%4, m%4 movddup m%2, %7 %endmacro %macro LOAD_SUMSUB_8x2P 9 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr movddup m%1, [%7] movddup m%2, [%7+8] mova m%4, [%6] movddup m%3, m%4 punpckhqdq m%4, m%4 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr movu m%4, [%7] mova m%2, [%6] DEINTB %1, %2, %3, %4, %5 psubw m%1, m%3 psubw m%2, m%4 SUMSUB_BA w, %1, %2, %3 %endmacro %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro %macro LOAD_SUMSUB_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr vbroadcasti128 m%1, [%6] vbroadcasti128 m%3, [%7] vbroadcasti128 m%2, [%8] vbroadcasti128 m%4, [%9] DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer mova xm%3, %6 mova xm%4, %8 mova xm%1, %5 mova xm%2, %7 vpermq m%3, m%3, q0011 vpermq m%4, m%4, q0011 vpermq m%1, m%1, q0011 vpermq m%2, m%2, q0011 %endmacro %macro LOAD_SUMSUB8_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
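; (these LOAD_SUMSUB* helpers lean on pmaddubsw with the +1/-1 hmul_* masks:
;  the byte-to-word widening and the first horizontal butterfly of the Hadamard
;  transform are folded into one instruction, and the psubw in
;  DIFF_SUMSUB_SSSE3 then forms the fenc minus ref residual that the remaining
;  transform stages work on.)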
LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 %xdefine %%n n%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset] LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset] %if %3 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif HADAMARD4_2D 4, 5, 6, 7, 3, %%n paddw m4, m6 ;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12) ; pxor m5, m5 ; punpcklwd m6, m4, m5 ; punpckhwd m4, m5 ; paddd m4, m6 ;%endif SWAP %%n, 4 %endmacro ; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 %if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 ; doing the abs first is a slight advantage ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 HADAMARD 1, max, %2, %4, %6, %7 %endif %ifnidn %9, swap %if (BIT_DEPTH == 12) pxor m%6, m%6 punpcklwd m%7, m%2, m%6 punpckhwd m%2, m%6 paddd m%8, m%7 paddd m%8, m%2 %else paddw m%8, m%2 %endif %else SWAP %8, %2 %if (BIT_DEPTH == 12) pxor m%6, m%6 punpcklwd m%7, m%8, m%6 punpckhwd m%8, m%6 paddd m%8, m%7 %endif %endif %if %1 %if (BIT_DEPTH == 12) pxor m%6, m%6 punpcklwd m%7, m%4, m%6 punpckhwd m%4, m%6 paddd m%8, m%7 paddd m%8, m%4 %else paddw m%8, m%4 %endif %else HADAMARD 1, max, %3, %5, %6, %7 %if (BIT_DEPTH == 12) pxor m%6, m%6 punpcklwd m%7, m%3, m%6 punpckhwd m%3, m%6 paddd m%8, m%7 paddd m%8, m%3 %else paddw m%8, m%3 %endif %endif %endmacro %macro SATD_8x4_1_SSE 10 %if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 ; doing the abs first is a slight advantage ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 HADAMARD 1, max, %2, %4, %6, %7 %endif pxor m%10, m%10 punpcklwd m%9, m%2, m%10 paddd m%8, m%9 punpckhwd m%9, m%2, m%10 paddd m%8, m%9 %if %1 pxor m%10, m%10 punpcklwd m%9, m%4, m%10 paddd m%8, m%9 punpckhwd m%9, m%4, m%10 paddd m%8, m%9 %else HADAMARD 1, max, %3, %5, %6, %7 pxor m%10, m%10 punpcklwd m%9, m%3, m%10 paddd m%8, m%9 punpckhwd m%9, m%3, m%10 paddd m%8, m%9 %endif %endmacro %macro SATD_START_MMX 0 FIX_STRIDES r1, r3 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 %endmacro %macro SATD_END_MMX 0 %if HIGH_BIT_DEPTH HADDUW m0, m1 movd eax, m0 %else ; !HIGH_BIT_DEPTH pshufw m1, m0, q1032 paddw m0, m1 pshufw m1, m0, q2301 paddw m0, m1 movd eax, m0 and eax, 0xffff %endif ; HIGH_BIT_DEPTH EMMS RET %endmacro ; FIXME avoid the spilling of regs to hold 3*stride. ; for small blocks on x86_32, modify pixel pointer instead. 
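; (overview: pixel_satd_* returns the sum of absolute values of the 4x4
;  Hadamard transform of the difference between the two input blocks, evaluated
;  on 4x4 / 8x4 tiles by SATD_4x4_MMX and SATD_8x4_SSE and accumulated over the
;  block; the larger sizes below are assembled by repeatedly calling the
;  8x8 / 16x4 internal kernels and advancing the pixel pointers.)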
;----------------------------------------------------------------------------- ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_satd_4x4, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 0 SATD_END_MMX cglobal pixel_satd_16x4_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 SATD_4x4_MMX m2, 8, 0 paddw m0, m1 SATD_4x4_MMX m1, 12, 0 paddw m0, m2 paddw m0, m1 ret cglobal pixel_satd_8x8_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 1 paddw m0, m2 paddw m0, m1 pixel_satd_8x4_internal_mmx2: SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 paddw m0, m1 ret %if HIGH_BIT_DEPTH %macro SATD_MxN_MMX 3 cglobal pixel_satd_%1x%2, 4,7 SATD_START_MMX pxor m0, m0 call pixel_satd_%1x%3_internal_mmx2 HADDUW m0, m1 movd r6d, m0 %rep %2/%3-1 pxor m0, m0 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_%1x%3_internal_mmx2 movd m2, r4 HADDUW m0, m1 movd r4, m0 add r6, r4 movd r4, m2 %endrep movifnidn eax, r6d RET %endmacro SATD_MxN_MMX 16, 16, 4 SATD_MxN_MMX 16, 8, 4 SATD_MxN_MMX 8, 16, 8 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 cglobal pixel_satd_16x16, 4,6 SATD_START_MMX pxor m0, m0 %rep 3 call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endrep call pixel_satd_16x4_internal_mmx2 HADDUW m0, m1 movd eax, m0 RET cglobal pixel_satd_16x8, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_16x4_internal_mmx2 SATD_END_MMX cglobal pixel_satd_8x16, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x8_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_8x8_internal_mmx2 SATD_END_MMX %endif ; !HIGH_BIT_DEPTH cglobal pixel_satd_8x8, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x8_internal_mmx2 SATD_END_MMX cglobal pixel_satd_8x4, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x4_internal_mmx2 SATD_END_MMX cglobal pixel_satd_4x16, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 SATD_4x4_MMX m1, 0, 1 paddw m0, m1 SATD_4x4_MMX m1, 0, 1 paddw m0, m1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX cglobal pixel_satd_4x8, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX %macro SATD_START_SSE2 2-3 0 FIX_STRIDES r1, r3 %if HIGH_BIT_DEPTH && %3 pxor %2, %2 %elif cpuflag(ssse3) && notcpuflag(atom) %if mmsize==32 mova %2, [hmul_16p] %else mova %2, [hmul_8p] %endif %endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro %macro SATD_END_SSE2 1-2 %if HIGH_BIT_DEPTH %if BIT_DEPTH == 12 HADDD %1, xm0 %else ; BIT_DEPTH == 12 HADDUW %1, xm0 %endif ; BIT_DEPTH == 12 %if %0 == 2 paddd %1, %2 %endif %else HADDW %1, xm7 %endif movd eax, %1 RET %endmacro %macro SATD_ACCUM 3 %if HIGH_BIT_DEPTH HADDUW %1, %2 paddd %3, %1 pxor %1, %1 %endif %endmacro %macro BACKUP_POINTERS 0 %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r6, r0 mov r7, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 %if ARCH_X86_64 lea r0, [r6+8*SIZEOF_PIXEL] lea r2, [r7+8*SIZEOF_PIXEL] %if WIN64 POP r7 %endif %else mov r0, r0mp mov r2, r2mp add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL %endif %endmacro %macro SATD_4x8_SSE 3-4 %if HIGH_BIT_DEPTH movh m0, [r0+0*r1] movh m4, [r2+0*r3] movh m1, [r0+1*r1] movh m5, [r2+1*r3] movhps m0, [r0+4*r1] movhps m4, [r2+4*r3] movh m2, [r0+2*r1] movh m6, [r2+2*r3] psubw m0, m4 movh m3, [r0+r4] movh m4, [r2+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] movhps m1, [r0+1*r1] movhps m5, [r2+1*r3] movhps m2, [r0+2*r1] movhps m6, [r2+2*r3] psubw m1, m5 movhps m3, 
[r0+r4] movhps m4, [r2+r5] psubw m2, m6 psubw m3, m4 %else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, [r2+r3] movd m6, [r2+2*r3] add r2, r5 movd m0, [r0] movd m1, [r0+r1] movd m2, [r0+2*r1] add r0, r4 movd m3, [r2+r3] JDUP m4, m3 movd m3, [r0+r1] JDUP m0, m3 movd m3, [r2+2*r3] JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 %if %1==0 && %2==1 mova m3, [hmul_4p] DIFFOP 0, 4, 1, 5, 3 %else DIFFOP 0, 4, 1, 5, 7 %endif movd m5, [r2] add r2, r5 movd m3, [r0] add r0, r4 movd m4, [r2] JDUP m6, m4 movd m4, [r0] JDUP m2, m4 movd m4, [r2+r3] JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 %if %1==0 && %2==1 mova m4, [hmul_4p] DIFFOP 2, 6, 3, 5, 4 %else DIFFOP 2, 6, 3, 5, 7 %endif %endif ; HIGH_BIT_DEPTH %if %0 == 4 SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4 %else SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 %endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH) cglobal pixel_satd_4x4, 4, 6, 6 SATD_START_MMX mova m4, [hmul_4p] LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 HADAMARD 0, sumsub, 0, 1, 2, 3 HADAMARD 4, sumsub, 0, 1, 2, 3 HADAMARD 1, amax, 0, 1, 2, 3 HADDW m0, m1 movd eax, m0 RET %endif cglobal pixel_satd_4x8, 4, 6, 8 SATD_START_MMX %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap %if BIT_DEPTH == 12 HADDD m7, m1 %else HADDUW m7, m1 %endif movd eax, m7 RET cglobal pixel_satd_4x16, 4, 6, 8 SATD_START_MMX %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0+r1*2*SIZEOF_PIXEL] lea r2, [r2+r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add %if BIT_DEPTH == 12 HADDD m7, m1 %else HADDUW m7, m1 %endif movd eax, m7 RET cglobal pixel_satd_8x8_internal LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret cglobal pixel_satd_8x8_internal2 %if WIN64 LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 %%pixel_satd_8x4_internal2: LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13 %else LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 %%pixel_satd_8x4_internal2: LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5 %endif ret ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx) cglobal pixel_satd_16x4_internal2 LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13 SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13 ret cglobal pixel_satd_16x4, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_16x8, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif jmp 
%%pixel_satd_16x8_internal cglobal pixel_satd_16x12, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x32, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x64, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,14 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 %%pixel_satd_16x8_internal: call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if 
vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 32] lea r2, [r7 + 32] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 32] lea r2, [r7 + 32] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 48] lea r2, [r7 + 48] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 
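; note: pixel_satd_16x4_internal2 advances r0/r2 by four rows itself, so the
; back-to-back calls in these wrappers simply walk down one 16-pixel-wide
; column; the running sum stays in m10 and each wrapper reduces it with HADDD
; before returning.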
call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 32] lea r2, [r7 + 32] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 48] lea r2, [r7 + 48] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 32] lea r2, [r7 + 32] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 48] lea r2, [r7 + 48] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx) SATD_START_SSE2 m10, m7 mov r6, r0 mov r7, r2 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call 
pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 16] lea r2, [r7 + 16] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 32] lea r2, [r7 + 32] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 lea r0, [r6 + 48] lea r2, [r7 + 48] call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 call pixel_satd_16x4_internal2 HADDD m10, m0 movd eax, m10 RET %else %if WIN64 cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET 
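; note on the WIN64/%else split used for all of these wrappers: the WIN64
; builds (4,8,14) keep the second source pointer live in r7 across the 8x8
; calls, while the generic builds (4,7,8,0-gprsize) park it in [rsp] and
; reload it plus the current column offset each time the walk moves to the
; next 8-pixel-wide strip.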
%else cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call %%pixel_satd_8x4_internal2 pxor m7, m7 movhlps m7, m6 paddd m6, m7 pshufd m7, m6, 1 paddd m6, m7 movd eax, m6 RET %else cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call %%pixel_satd_8x4_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if HIGH_BIT_DEPTH %if WIN64 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) SATD_START_MMX mov r6, r0 mov r7, r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 4*SIZEOF_PIXEL] lea r2, [r7 + 4*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 HADDD m7, m0 movd eax, m7 RET %else cglobal pixel_satd_12x32, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 4*SIZEOF_PIXEL] mov r2, [rsp] add r2, 4*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 HADDD m7, m0 movd eax, m7 RET %endif %else ;HIGH_BIT_DEPTH %if WIN64 cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) SATD_START_MMX mov r6, r0 mov r7, r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + 
r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 4*SIZEOF_PIXEL] lea r2, [r7 + 4*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %else cglobal pixel_satd_12x32, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 4*SIZEOF_PIXEL] mov r2, [rsp] add r2, 4*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %endif %endif %if HIGH_BIT_DEPTH %if WIN64 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) SATD_START_MMX mov r6, r0 mov r7, r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 HADDD m7, m0 movd eax, m7 RET %else cglobal pixel_satd_4x32, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 pxor m1, m1 movhlps m1, m7 paddd m7, m1 pshufd m1, m7, 1 paddd m7, m1 movd eax, m7 RET %endif %else %if WIN64 cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) SATD_START_MMX mov r6, r0 mov r7, r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + 
r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %else cglobal pixel_satd_4x32, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %endif %endif %if WIN64 cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x24, 
4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2,8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2,16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2,24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2,32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2,40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] lea r2, [r7 + 48*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] lea r2, [r7 + 56*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2,8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2,16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2,24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2,32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2,40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add 
r2,48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2,56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] lea r2, [r7 + 48*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] lea r2, [r7 + 56*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2, 32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2, 40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add r2, 48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2, 56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] lea r2, [r7 + 48*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] lea r2, [r7 + 56*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2, 32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2, 40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add r2, 48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2, 56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] lea r2, [r7 + 48*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] lea r2, [r7 + 56*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] 
mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2, 32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2, 40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add r2, 48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2, 56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_16x4, 4,6,14 %else cglobal pixel_satd_16x4, 4,6,8 %endif SATD_START_SSE2 m6, m7 BACKUP_POINTERS call %%pixel_satd_8x4_internal2 RESTORE_AND_INC_POINTERS call %%pixel_satd_8x4_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_16x8, 4,6,14 %else cglobal pixel_satd_16x8, 4,6,8 %endif SATD_START_SSE2 m6, m7 BACKUP_POINTERS call pixel_satd_8x8_internal2 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_16x12, 4,6,14 %else cglobal pixel_satd_16x12, 4,6,8 %endif SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal2 call %%pixel_satd_8x4_internal2 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal2 call %%pixel_satd_8x4_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_16x16, 4,6,14 %else cglobal pixel_satd_16x16, 4,6,8 %endif SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_16x32, 4,6,14 %else cglobal pixel_satd_16x32, 4,6,8 %endif SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_16x64, 4,6,14 %else cglobal pixel_satd_16x64, 4,6,8 %endif SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if HIGH_BIT_DEPTH %if WIN64 cglobal pixel_satd_12x16, 4,8,8 SATD_START_MMX mov r6, r0 mov r7, r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 4*SIZEOF_PIXEL] lea r2, [r7 + 4*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 HADDD m7, m0 movd eax, m7 RET %else cglobal pixel_satd_12x16, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 pxor m7, m7 SATD_4x8_SSE vertical, 0, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 4*SIZEOF_PIXEL] mov r2, [rsp] add r2, 4*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, 4, 5 lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, 4, 5 HADDD m7, m0 movd eax, m7 RET %endif %else ;HIGH_BIT_DEPTH %if WIN64 cglobal pixel_satd_12x16, 4,8,8 SATD_START_MMX mov r6, r0 mov r7, r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 4*SIZEOF_PIXEL] lea r2, [r7 + 4*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %else cglobal pixel_satd_12x16, 4,7,8,0-gprsize SATD_START_MMX mov r6, r0 mov [rsp], r2 %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 4*SIZEOF_PIXEL] mov r2, [rsp] add r2, 4*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL SATD_4x8_SSE vertical, 1, add lea r0, [r0 + r1*2*SIZEOF_PIXEL] lea r2, [r2 + r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET %endif %endif %if WIN64 cglobal pixel_satd_24x32, 4,8,14 SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 
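; the 24-wide block is covered as three 8-wide columns of four 8x8 SATD calls
; each; the two leas below move to the last column at x = 16.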
lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_24x32, 4,7,8,0-gprsize SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif ;WIN64 %if WIN64 cglobal pixel_satd_8x32, 4,6,14 %else cglobal pixel_satd_8x32, 4,6,8 %endif SATD_START_SSE2 m6, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %if WIN64 cglobal pixel_satd_8x16, 4,6,14 %else cglobal pixel_satd_8x16, 4,6,8 %endif SATD_START_SSE2 m6, m7 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET cglobal pixel_satd_8x8, 4,6,8 SATD_START_SSE2 m6, m7 call pixel_satd_8x8_internal SATD_END_SSE2 m6 %if WIN64 cglobal pixel_satd_8x4, 4,6,14 %else cglobal pixel_satd_8x4, 4,6,8 %endif SATD_START_SSE2 m6, m7 call %%pixel_satd_8x4_internal2 SATD_END_SSE2 m6 %endmacro ; SATDS_SSE2 ;============================================================================= ; SA8D ;============================================================================= %macro SA8D_INTER 0 %if ARCH_X86_64 %define lh m10 %define rh m0 %else %define lh m0 %define rh [esp+48] %endif %if HIGH_BIT_DEPTH HADDUW m0, m1 paddd lh, rh %else paddusw lh, rh %endif ; HIGH_BIT_DEPTH %endmacro %macro SA8D_8x8 0 call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH paddd m0, [pd_1] psrld m0, 1 paddd m12, m0 %endmacro %macro SA8D_16x16 0 call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 call pixel_sa8d_8x8_internal ; pix[8] lea r2, [r2+8*r3] lea r0, [r0+8*r1] SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r2, 8*SIZEOF_PIXEL sub r0, 8*SIZEOF_PIXEL SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride] SA8D_INTER SWAP 0, 10 %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif paddd m0, [pd_1] psrld m0, 1 paddd m12, m0 %endmacro %macro AVG_16x16 0 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov dword [esp+36], r4d %endmacro %macro SA8D 0 ; sse2 doesn't seem to like the horizontal way of doing things %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %endmacro ; SA8D ; INTRA SATD ;============================================================================= %define TRANS TRANS_SSE2 %define DIFFOP DIFF_UNPACK_SSE2 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size %define movdqu movups %define punpcklqdq movlhps INIT_XMM sse2 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %if HIGH_BIT_DEPTH == 0 INIT_XMM ssse3,atom SATDS_SSE2 SA8D %endif %define DIFFOP DIFF_SUMSUB_SSSE3 %define 
LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE %if HIGH_BIT_DEPTH == 0 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 %endif INIT_XMM ssse3 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps %define TRANS TRANS_SSE4 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN INIT_XMM sse4 %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so ; it's effectively free. %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SA8D SATDS_SSE2 %define TRANS TRANS_XOP INIT_XMM xop %if BIT_DEPTH <= 10 SA8D %endif SATDS_SSE2 %if HIGH_BIT_DEPTH == 0 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 %define TRANS TRANS_SSE4 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] movddup xm%1, [r0] movddup xm%3, [r2] movddup xm%2, [r0+4*r1] movddup xm%5, [r2+4*r3] vinserti128 m%1, m%1, xm%2, 1 vinserti128 m%3, m%3, xm%5, 1 movddup xm%2, [r0+r1] movddup xm%4, [r2+r3] movddup xm%5, [r0+r4] movddup xm%6, [r2+r5] vinserti128 m%2, m%2, xm%5, 1 vinserti128 m%4, m%4, xm%6, 1 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 lea r0, [r0+2*r1] lea r2, [r2+2*r3] movddup xm%3, [r0] movddup xm%5, [r0+4*r1] vinserti128 m%3, m%3, xm%5, 1 movddup xm%5, [r2] movddup xm%4, [r2+4*r3] vinserti128 m%5, m%5, xm%4, 1 movddup xm%4, [r0+r1] movddup xm%6, [r0+r4] vinserti128 m%4, m%4, xm%6, 1 movq xm%6, [r2+r3] movhps xm%6, [r2+r5] vpermq m%6, m%6, q1100 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 %endmacro %macro SATD_START_AVX2 2-3 0 FIX_STRIDES r1, r3 %if %3 mova %2, [hmul_8p] lea r4, [5*r1] lea r5, [5*r3] %else mova %2, [hmul_16p] lea r4, [3*r1] lea r5, [3*r3] %endif pxor %1, %1 %endmacro %define TRANS TRANS_SSE4 INIT_YMM avx2 cglobal pixel_satd_16x8_internal LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 ret cglobal pixel_satd_16x16, 4,6,8 SATD_START_AVX2 m6, m7 call pixel_satd_16x8_internal lea r0, [r0+4*r1] lea r2, [r2+4*r3] pixel_satd_16x8_internal: call pixel_satd_16x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET cglobal pixel_satd_16x8, 4,6,8 SATD_START_AVX2 m6, m7 jmp pixel_satd_16x8_internal cglobal pixel_satd_8x8_internal LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 ret cglobal pixel_satd_8x16, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_satd_8x8_internal lea r0, [r0+2*r1] lea r2, [r2+2*r3] lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_8x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET cglobal pixel_satd_8x8, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_satd_8x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET %endif ;;--------------------------------------------------------------- ;; SATD AVX2 ;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t) ;;--------------------------------------------------------------- ;; r0 - pix0 ;; r1 - pix0Stride ;; r2 - pix1 ;; r3 - pix1Stride %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows pxor m6, m6 vbroadcasti128 m0, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m1, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m1, m7 psubw m0, m4 psubw m1, m5 vbroadcasti128 m2, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] 
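; note: every caller sets r4 = 3*r1 and r5 = 3*r3, so the next two loads fetch
; the fourth row (offset 3*stride) of each 16-wide block; pmaddubsw against m7
; ([hmul_16p]) in effect folds the first horizontal butterfly stage into the
; load, the paddw/psubw ladder that follows is the vertical stage, and
; pabsw/pmaxsw collapse the packed halves before the partial sums are
; accumulated into m6.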
vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] pmaddubsw m4, m7 pmaddubsw m2, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m2, m4 psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m1, m0 paddw m0, m2, m3 psubw m3, m2 paddw m2, m4, m0 psubw m0, m4 paddw m4, m1, m3 psubw m3, m1 pabsw m2, m2 pabsw m0, m0 pabsw m4, m4 pabsw m3, m3 pblendw m1, m2, m0, 10101010b pslld m0, 16 psrld m2, 16 por m0, m2 pmaxsw m1, m0 paddw m6, m1 pblendw m2, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m2, m3 paddw m6, m2 vbroadcasti128 m1, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m2, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m1, m7 pmaddubsw m5, m7 pmaddubsw m2, m7 psubw m1, m4 psubw m2, m5 vbroadcasti128 m0, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m0, m4 psubw m3, m5 paddw m4, m1, m2 psubw m2, m1 paddw m1, m0, m3 psubw m3, m0 paddw m0, m4, m1 psubw m1, m4 paddw m4, m2, m3 psubw m3, m2 pabsw m0, m0 pabsw m1, m1 pabsw m4, m4 pabsw m3, m3 pblendw m2, m0, m1, 10101010b pslld m1, 16 psrld m0, 16 por m1, m0 pmaxsw m2, m1 paddw m6, m2 pblendw m0, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m0, m3 paddw m6, m0 vextracti128 xm0, m6, 1 pmovzxwd m6, xm6 pmovzxwd m0, xm0 paddd m8, m6 paddd m9, m0 ret cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows pxor m6, m6 vbroadcasti128 m0, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m1, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m1, m7 psubw m0, m4 psubw m1, m5 vbroadcasti128 m2, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] pmaddubsw m4, m7 pmaddubsw m2, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m2, m4 psubw m3, m5 paddw m4, m0, m1 psubw m1, m1, m0 paddw m0, m2, m3 psubw m3, m2 paddw m2, m4, m0 psubw m0, m4 paddw m4, m1, m3 psubw m3, m1 pabsw m2, m2 pabsw m0, m0 pabsw m4, m4 pabsw m3, m3 pblendw m1, m2, m0, 10101010b pslld m0, 16 psrld m2, 16 por m0, m2 pmaxsw m1, m0 paddw m6, m1 pblendw m2, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m2, m3 paddw m6, m2 vextracti128 xm0, m6, 1 pmovzxwd m6, xm6 pmovzxwd m0, xm0 paddd m8, m6 paddd m9, m0 ret cglobal pixel_satd_16x4, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x4 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x12, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x8 call calc_satd_16x4 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x32, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x64, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call 
calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 
RET cglobal pixel_satd_64x16, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x32, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 INIT_YMM avx2 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows ; rows 0-3 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] 
movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ; rows 4-7 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ret cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows ; rows 0-3 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ret cglobal pixel_satd_16x4, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x4 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x8, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x12, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x4 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd 
xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x16, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x32, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x64, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x8, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x16, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x24, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x32, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x64, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_48x64, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea 
r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x16, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 96] lea r2, [r7 + 96] call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x32, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 96] lea r2, [r7 + 96] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x48, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 96] lea r2, [r7 + 96] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x64, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 96] lea r2, [r7 + 96] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, 
xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 %if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 %macro LOAD_DIFF_AVX2 4 movu %1, %3 movu %2, %4 psubw %1, %2 %endmacro %macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer LOAD_DIFF_AVX2 xm%1, xm%5, [%7], [%8] LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1], [%8+r3] LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3] LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4], [%8+r5] ;lea %7, [%7+4*r1] ;lea %8, [%8+4*r3] %endmacro INIT_YMM avx2 cglobal pixel_satd_8x8, 4,4,7 FIX_STRIDES r1, r3 pxor xm6, xm6 ; load_diff 0 & 4 movu xm0, [r0] movu xm1, [r2] vinserti128 m0, m0, [r0 + r1 * 4], 1 vinserti128 m1, m1, [r2 + r3 * 4], 1 psubw m0, m1 add r0, r1 add r2, r3 ; load_diff 1 & 5 movu xm1, [r0] movu xm2, [r2] vinserti128 m1, m1, [r0 + r1 * 4], 1 vinserti128 m2, m2, [r2 + r3 * 4], 1 psubw m1, m2 add r0, r1 add r2, r3 ; load_diff 2 & 6 movu xm2, [r0] movu xm3, [r2] vinserti128 m2, m2, [r0 + r1 * 4], 1 vinserti128 m3, m3, [r2 + r3 * 4], 1 psubw m2, m3 add r0, r1 add r2, r3 ; load_diff 3 & 7 movu xm3, [r0] movu xm4, [r2] vinserti128 m3, m3, [r0 + r1 * 4], 1 vinserti128 m4, m4, [r2 + r3 * 4], 1 psubw m3, m4 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 vextracti128 xm0, m6, 1 paddw xm6, xm0 HADDUW xm6, xm0 movd eax, xm6 RET ; TODO: optimize me, need more 2 of YMM registers because C model get partial result every 16x16 block %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 xavs2-1.3/source/common/x86/ssd-a.asm000066400000000000000000002443661340660520300173640ustar00rootroot00000000000000;***************************************************************************** ;* ssd-a.asm: x86 ssd functions ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 SECTION .text cextern pw_00ff cextern hsub_mul ;============================================================================= ; SSD ;============================================================================= %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 cglobal pixel_ssd_ss_%1x%2, 4,7,8 FIX_STRIDES r1, r3 %if mmsize == %1*2 %define offset0_1 r1 %define offset0_2 r1*2 %define offset0_3 r5 %define offset1_1 r3 %define offset1_2 r3*2 %define offset1_3 r6 lea r5, [3*r1] lea r6, [3*r3] %elif mmsize == %1 %define offset0_1 mmsize %define offset0_2 r1 %define offset0_3 r1+mmsize %define offset1_1 mmsize %define offset1_2 r3 %define offset1_3 r3+mmsize %elif mmsize == %1/2 %define offset0_1 mmsize %define offset0_2 mmsize*2 %define offset0_3 mmsize*3 %define offset1_1 mmsize %define offset1_2 mmsize*2 %define offset1_3 mmsize*3 %endif %assign %%n %2/(2*mmsize/%1) %if %%n > 1 mov r4d, %%n %endif pxor m0, m0 .loop: movu m1, [r0] movu m2, [r0+offset0_1] movu m3, [r0+offset0_2] movu m4, [r0+offset0_3] movu m6, [r2] movu m7, [r2+offset1_1] psubw m1, m6 psubw m2, m7 movu m6, [r2+offset1_2] movu m7, [r2+offset1_3] psubw m3, m6 psubw m4, m7 %if %%n > 1 lea r0, [r0+r1*(%2/%%n)] lea r2, [r2+r3*(%2/%%n)] %endif pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 %if %%n > 1 dec r4d jg .loop %endif %if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16 %if mmsize == 16 movu m5, m0 pxor m6, m6 punpckldq m0, m6 punpckhdq m5, m6 paddq m0, m5 movhlps m5, m0 paddq m0, m5 movq r6, xm0 %elif mmsize == 32 movu m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m0, m1 vextracti128 xm2, m0, 1 paddq xm2, xm0 movhlps xm1, xm2 paddq xm2, xm1 movq rax, xm2 %endif %else HADDD m0, m5 movd eax, xm0 %endif %ifidn movu,movq ; detect MMX EMMS %endif RET %endmacro ; Function to find ssd for 32x16 block, sse2, 12 bit depth ; Defined sepeartely to be called from SSD_ONE_32 macro INIT_XMM sse2 cglobal ssd_ss_32x16 pxor m8, m8 mov r4d, 16 .loop: movu m0, [r0] movu m1, [r0+mmsize] movu m2, [r0+2*mmsize] movu m3, [r0+3*mmsize] movu m4, [r2] movu m5, [r2+mmsize] movu m6, [r2+2*mmsize] movu m7, [r2+3*mmsize] psubw m0, m4 psubw m1, m5 psubw m2, m6 psubw m3, m7 add r0, r1 add r2, r3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 paddd m2, m3 paddd m0, m1 paddd m0, m2 paddd m8, m0 dec r4d jnz .loop mova m4, m8 pxor m5, m5 punpckldq m8, m5 punpckhdq m4, m5 paddq m4, m8 movhlps m5, m4 paddq m4, m5 paddq m9, m4 ret %macro SSD_ONE_32 0 cglobal pixel_ssd_ss_32x64, 4,7,10 add r1d, r1d add r3d, r3d pxor m9, m9 xor r4, r4 call ssd_ss_32x16 call ssd_ss_32x16 call ssd_ss_32x16 call ssd_ss_32x16 movq rax, m9 RET %endmacro %macro SSD_ONE_SS_32 0 cglobal pixel_ssd_ss_32x32, 4,5,8 add r1d, r1d add r3d, r3d pxor m5, m5 pxor m6, m6 mov r4d, 2 .iterate: mov r5d, 16 pxor m4, m4 pxor m7, m7 .loop: movu m0, [r0] movu m1, [r0 + mmsize] movu m2, [r2] movu m3, [r2 + mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 movu m0, [r0 + 2 * mmsize] movu m1, [r0 + 3 * mmsize] movu m2, [r2 + 2 * mmsize] movu m3, [r2 + 3 * mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 
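    ; The dword accumulators m4/m7 hold at most 16 rows of squared 12-bit
    ; differences; they are widened to qwords right after the inner loop below,
    ; which is what keeps the 32-bit partial sums from overflowing.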
add r0, r1 add r2, r3 dec r5d jnz .loop mova m0, m4 pxor m1, m1 punpckldq m0, m1 punpckhdq m4, m1 paddq m5, m0 paddq m6, m4 mova m0, m7 punpckldq m0, m1 punpckhdq m7, m1 paddq m5, m0 paddq m6, m7 dec r4d jnz .iterate paddq m5, m6 movhlps m2, m5 paddq m5, m2 movq rax, m5 RET %endmacro %macro SSD_ONE_SS_64 0 cglobal pixel_ssd_ss_64x64, 4,6,8 add r1d, r1d add r3d, r3d pxor m5, m5 pxor m6, m6 mov r5d, 8 .iterate: pxor m4, m4 pxor m7, m7 mov r4d, 8 .loop: ;----process 1st half a row---- movu m0, [r0] movu m1, [r0 + mmsize] movu m2, [r2] movu m3, [r2 + mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 movu m0, [r0 + 2 * mmsize] movu m1, [r0 + 3 * mmsize] movu m2, [r2 + 2 * mmsize] movu m3, [r2 + 3 * mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 ;----process 2nd half a row---- movu m0, [r0 + 4 * mmsize] movu m1, [r0 + 5 * mmsize] movu m2, [r2 + 4 * mmsize] movu m3, [r2 + 5 * mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 movu m0, [r0 + 6 * mmsize] movu m1, [r0 + 7 * mmsize] movu m2, [r2 + 6 * mmsize] movu m3, [r2 + 7 * mmsize] psubw m0, m2 psubw m1, m3 pmaddwd m0, m0 pmaddwd m1, m1 paddd m4, m0 paddd m7, m1 add r0, r1 add r2, r3 dec r4d jnz .loop mova m0, m4 pxor m1, m1 punpckldq m0, m1 punpckhdq m4, m1 paddq m5, m0 paddq m6, m4 mova m0, m7 punpckldq m0, m1 punpckhdq m7, m1 paddq m5, m0 paddq m6, m7 dec r5 jne .iterate paddq m5, m6 movhlps m2, m5 paddq m5, m2 movq rax, m5 RET %endmacro %macro SSD_TWO 2 cglobal pixel_ssd_ss_%1x%2, 4,7,8 FIX_STRIDES r1, r3 pxor m0, m0 mov r4d, %2/2 lea r5, [r1 * 2] lea r6, [r3 * 2] .loop: movu m1, [r0] movu m2, [r0 + 16] movu m3, [r0 + 32] movu m4, [r0 + 48] movu m6, [r2] movu m7, [r2 + 16] psubw m1, m6 psubw m2, m7 movu m6, [r2 + 32] movu m7, [r2 + 48] psubw m3, m6 psubw m4, m7 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [r0 + 64] movu m2, [r0 + 80] movu m6, [r2 + 64] movu m7, [r2 + 80] psubw m1, m6 psubw m2, m7 pmaddwd m1, m1 pmaddwd m2, m2 paddd m1, m2 paddd m0, m1 %if %1 == 64 movu m3, [r0 + 96] movu m4, [r0 + 112] movu m6, [r2 + 96] movu m7, [r2 + 112] psubw m3, m6 psubw m4, m7 pmaddwd m3, m3 pmaddwd m4, m4 paddd m3, m4 paddd m0, m3 %endif movu m1, [r0 + r1] movu m2, [r0 + r1 + 16] movu m3, [r0 + r1 + 32] movu m4, [r0 + r1 + 48] movu m6, [r2 + r3] movu m7, [r2 + r3 + 16] psubw m1, m6 psubw m2, m7 movu m6, [r2 + r3 + 32] movu m7, [r2 + r3 + 48] psubw m3, m6 psubw m4, m7 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [r0 + r1 + 64] movu m2, [r0 + r1 + 80] movu m6, [r2 + r3 + 64] movu m7, [r2 + r3 + 80] psubw m1, m6 psubw m2, m7 pmaddwd m1, m1 pmaddwd m2, m2 paddd m1, m2 paddd m0, m1 %if %1 == 64 movu m3, [r0 + r1 + 96] movu m4, [r0 + r1 + 112] movu m6, [r2 + r3 + 96] movu m7, [r2 + r3 + 112] psubw m3, m6 psubw m4, m7 pmaddwd m3, m3 pmaddwd m4, m4 paddd m3, m4 paddd m0, m3 %endif lea r0, [r0 + r5] lea r2, [r2 + r6] dec r4d jnz .loop %if BIT_DEPTH == 10 && %1 == 64 && %2 ==64 movu m5, m0 pxor m6, m6 punpckldq m0, m6 punpckhdq m5, m6 paddq m0, m5 movhlps m5, m0 paddq m0, m5 movq rax, xm0 %else HADDD m0, m5 movd eax, xm0 %endif RET %endmacro %macro SSD_24 2 cglobal pixel_ssd_ss_%1x%2, 4,7,8 FIX_STRIDES r1, r3 pxor m0, m0 mov r4d, %2/2 lea r5, [r1 * 2] lea r6, [r3 * 2] .loop: movu m1, [r0] movu m2, [r0 + 16] movu m3, [r0 + 32] movu m5, [r2] movu m6, [r2 + 16] movu m7, [r2 + 32] psubw m1, m5 
psubw m2, m6 psubw m3, m7 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 paddd m1, m2 paddd m0, m1 movu m1, [r0 + r1] movu m2, [r0 + r1 + 16] movu m4, [r0 + r1 + 32] movu m5, [r2 + r3] movu m6, [r2 + r3 + 16] movu m7, [r2 + r3 + 32] psubw m1, m5 psubw m2, m6 psubw m4, m7 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 lea r0, [r0 + r5] lea r2, [r2 + r6] dec r4d jnz .loop HADDD m0, m5 movd eax, xm0 RET %endmacro %macro SSD_12 2 cglobal pixel_ssd_ss_%1x%2, 4,7,8 FIX_STRIDES r1, r3 pxor m0, m0 mov r4d, %2/4 lea r5, [r1 * 2] lea r6, [r3 * 2] .loop: movu m1, [r0] movh m2, [r0 + 16] movu m3, [r0 + r1] punpcklqdq m2, [r0 + r1 + 16] movu m7, [r2] psubw m1, m7 movh m4, [r2 + 16] movu m7, [r2 + r3] psubw m3, m7 punpcklqdq m4, [r2 + r3 + 16] psubw m2, m4 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 paddd m1, m2 paddd m0, m1 movu m1, [r0 + r5] movh m2, [r0 + r5 + 16] lea r0, [r0 + r5] movu m6, [r0 + r1] punpcklqdq m2, [r0 + r1 + 16] movu m7, [r2 + r6] psubw m1, m7 movh m4, [r2 + r6 + 16] lea r2, [r2 + r6] movu m7, [r2 + r3] psubw m6, m7 punpcklqdq m4, [r2 + r3 + 16] psubw m2, m4 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m6, m6 paddd m1, m2 paddd m3, m6 paddd m0, m1 paddd m0, m3 lea r0, [r0 + r5] lea r2, [r2 + r6] dec r4d jnz .loop HADDD m0, m5 movd eax, xm0 RET %endmacro INIT_YMM avx2 cglobal pixel_ssd_16x16, 4,7,3 FIX_STRIDES r1, r3 lea r5, [3 * r1] lea r6, [3 * r3] mov r4d, 4 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r0 + r1] psubw m1, [r2] psubw m2, [r2 + r3] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 movu m1, [r0 + r1 * 2] movu m2, [r0 + r5] psubw m1, [r2 + r3 * 2] psubw m2, [r2 + r6] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r4d jg .loop mova m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m0, m1 vextracti128 xm2, m0, 1 paddq xm2, xm0 movhlps xm1, xm2 paddq xm2, xm1 movq rax, xm2 ret INIT_YMM avx2 cglobal pixel_ssd_32x2 pxor m0, m0 movu m1, [r0] movu m2, [r0 + 32] psubw m1, [r2] psubw m2, [r2 + 32] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 movu m1, [r0 + r1] movu m2, [r0 + r1 + 32] psubw m1, [r2 + r3] psubw m2, [r2 + r3 + 32] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 lea r0, [r0 + r1 * 2] lea r2, [r2 + r3 * 2] mova m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m3, m0 paddq m4, m1 ret INIT_YMM avx2 cglobal pixel_ssd_32x32, 4,5,5 add r1, r1 add r3, r3 pxor m3, m3 pxor m4, m4 mov r4, 16 .iterate: call pixel_ssd_32x2 dec r4d jne .iterate paddq m3, m4 vextracti128 xm4, m3, 1 paddq xm3, xm4 movhlps xm4, xm3 paddq xm3, xm4 movq rax, xm3 RET INIT_YMM avx2 cglobal pixel_ssd_64x64, 4,5,5 FIX_STRIDES r1, r3 mov r4d, 64 pxor m3, m3 pxor m4, m4 .loop: pxor m0, m0 movu m1, [r0] movu m2, [r0+32] psubw m1, [r2] psubw m2, [r2+32] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 movu m1, [r0+32*2] movu m2, [r0+32*3] psubw m1, [r2+32*2] psubw m2, [r2+32*3] pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m0, m2 lea r0, [r0+r1] lea r2, [r2+r3] mova m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m3, m0 paddq m4, m1 dec r4d jg .loop paddq m3, m4 vextracti128 xm4, m3, 1 paddq xm3, xm4 movhlps xm4, xm3 paddq xm3, xm4 movq rax, xm3 RET INIT_MMX mmx2 SSD_ONE 4, 4 SSD_ONE 4, 8 SSD_ONE 4, 16 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 8, 32 SSD_12 12, 16 SSD_ONE 16, 4 SSD_ONE 16, 8 SSD_ONE 16, 12 SSD_ONE 16, 16 SSD_ONE 16, 32 SSD_ONE 16, 64 SSD_24 24, 
32 SSD_ONE 32, 8 SSD_ONE 32, 16 SSD_ONE 32, 24 %if BIT_DEPTH <= 10 SSD_ONE 32, 64 SSD_ONE 32, 32 SSD_TWO 64, 64 %else SSD_ONE_32 SSD_ONE_SS_32 SSD_ONE_SS_64 %endif SSD_TWO 48, 64 SSD_TWO 64, 16 SSD_TWO 64, 32 SSD_TWO 64, 48 INIT_YMM avx2 SSD_ONE 16, 8 SSD_ONE 16, 32 SSD_ONE 32, 64 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH == 0 %macro SSD_SS 2 cglobal pixel_ssd_ss_%1x%2, 4,7,6 FIX_STRIDES r1, r3 %if mmsize == %1*4 || mmsize == %1*2 %define offset0_1 r1*2 %define offset0_2 r1*4 %define offset0_3 r5 %define offset1_1 r3*2 %define offset1_2 r3*4 %define offset1_3 r6 lea r5, [4*r1] lea r6, [4*r3] lea r5, [r5 + 2*r1] lea r6, [r6 + 2*r3] %elif mmsize == %1 %define offset0_1 16 %define offset0_2 r1*2 %define offset0_3 r1*2+16 %define offset1_1 16 %define offset1_2 r3*2 %define offset1_3 r3*2+16 %endif %if %1 == 4 %assign %%n %2/(mmsize/%1) %else %assign %%n %2/(2*mmsize/%1) %endif %if %%n > 1 mov r4d, %%n %endif pxor m0, m0 .loop: %if %1 == 4 movh m1, [r0] movh m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movh m1, [r0 + offset0_1] movh m2, [r2 + offset1_1] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movh m1, [r0 + offset0_2] movh m2, [r2 + offset1_2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movh m1, [r0 + offset0_3] movh m2, [r2 + offset1_3] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 %else movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + offset0_1] movu m2, [r2 + offset1_1] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + offset0_2] movu m2, [r2 + offset1_2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + offset0_3] movu m2, [r2 + offset1_3] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 %endif lea r0, [r0+r1*(%2/%%n)*2] lea r2, [r2+r3*(%2/%%n)*2] %if %%n > 1 dec r4d jg .loop %endif %if %1 == 4 %if notcpuflag(ssse3) pshufd m1, m0, 1 paddd m0, m1 %else phaddd m0, m0 %endif %else HADDD m0, m1 %endif movd eax, m0 RET %endmacro %macro SSD_SS_ONE 0 SSD_SS 4, 4 SSD_SS 4, 8 SSD_SS 4, 16 SSD_SS 8, 4 SSD_SS 8, 8 SSD_SS 8, 16 SSD_SS 8, 32 SSD_SS 16, 4 SSD_SS 16, 8 SSD_SS 16, 12 SSD_SS 16, 16 SSD_SS 16, 32 SSD_SS 16, 64 %endmacro %macro SSD_SS_12x16 0 cglobal pixel_ssd_ss_12x16, 4,7,6 FIX_STRIDES r1, r3 mov r4d, 8 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 pslldq m1, 8 psrldq m1, 8 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 pslldq m1, 8 psrldq m1, 8 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] dec r4d jnz .loop HADDD m0, m1 movd eax, m0 RET %endmacro %macro SSD_SS_32 1 cglobal pixel_ssd_ss_32x%1, 4,7,6 FIX_STRIDES r1, r3 mov r4d, %1/2 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, 
m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] dec r4d jnz .loop HADDD m0, m1 movd eax, m0 RET %endmacro %macro SSD_SS_32xN 0 SSD_SS_32 8 SSD_SS_32 16 SSD_SS_32 24 SSD_SS_32 32 SSD_SS_32 64 %endmacro %macro SSD_SS_24 0 cglobal pixel_ssd_ss_24x32, 4,7,6 FIX_STRIDES r1, r3 mov r4d, 16 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] dec r4d jnz .loop HADDD m0, m1 movd eax, m0 RET %endmacro %macro SSD_SS_48 0 cglobal pixel_ssd_ss_48x64, 4,7,6 FIX_STRIDES r1, r3 mov r4d, 32 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 64] movu m2, [r2 + 64] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 80] movu m2, [r2 + 80] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 64] movu m2, [r2 + 64] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 80] movu m2, [r2 + 80] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] dec r4d jnz .loop HADDD m0, m1 movd eax, m0 RET %endmacro %macro SSD_SS_64 1 cglobal pixel_ssd_ss_64x%1, 4,7,6 FIX_STRIDES r1, r3 mov r4d, %1/2 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 64] movu m2, [r2 + 64] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 80] movu m2, [r2 + 80] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 96] movu m2, [r2 + 96] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 112] movu m2, [r2 + 112] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] movu m1, [r0] movu m2, [r2] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 16] movu m2, [r2 + 16] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 32] movu m2, [r2 + 32] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 48] movu m2, [r2 + 48] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 64] movu m2, [r2 + 64] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 80] movu m2, [r2 + 80] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 96] movu m2, [r2 + 96] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movu m1, [r0 + 112] movu m2, [r2 + 112] psubw m1, m2 pmaddwd m1, m1 paddd 
m0, m1 lea r0, [r0 + 2*r1] lea r2, [r2 + 2*r3] dec r4d jnz .loop HADDD m0, m1 movd eax, m0 RET %endmacro %macro SSD_SS_64xN 0 SSD_SS_64 16 SSD_SS_64 32 SSD_SS_64 48 SSD_SS_64 64 %endmacro INIT_XMM sse2 SSD_SS_ONE SSD_SS_12x16 SSD_SS_24 SSD_SS_32xN SSD_SS_48 SSD_SS_64xN INIT_XMM sse4 SSD_SS_ONE SSD_SS_12x16 SSD_SS_24 SSD_SS_32xN SSD_SS_48 SSD_SS_64xN INIT_XMM avx SSD_SS_ONE SSD_SS_12x16 SSD_SS_24 SSD_SS_32xN SSD_SS_48 SSD_SS_64xN INIT_YMM avx2 cglobal pixel_ssd_ss_16x16, 4,6,4 add r1d, r1d add r3d, r3d pxor m2, m2 pxor m3, m3 lea r4, [3 * r1] lea r5, [3 * r3] movu m0, [r0] movu m1, [r0 + r1] psubw m0, [r2] psubw m1, [r2 + r3] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + 2 * r1] movu m1, [r0 + r4] psubw m0, [r2 + 2 * r3] psubw m1, [r2 + r5] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] movu m0, [r0] movu m1, [r0 + r1] psubw m0, [r2] psubw m1, [r2 + r3] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + 2 * r1] movu m1, [r0 + r4] psubw m0, [r2 + 2 * r3] psubw m1, [r2 + r5] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] movu m0, [r0] movu m1, [r0 + r1] psubw m0, [r2] psubw m1, [r2 + r3] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + 2 * r1] movu m1, [r0 + r4] psubw m0, [r2 + 2 * r3] psubw m1, [r2 + r5] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] movu m0, [r0] movu m1, [r0 + r1] psubw m0, [r2] psubw m1, [r2 + r3] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + 2 * r1] movu m1, [r0 + r4] psubw m0, [r2 + 2 * r3] psubw m1, [r2 + r5] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 paddd m2, m3 HADDD m2, m0 movd eax, xm2 RET INIT_YMM avx2 cglobal pixel_ssd_ss_32x32, 4,5,4 add r1d, r1d add r3d, r3d pxor m2, m2 pxor m3, m3 mov r4d, 16 .loop: movu m0, [r0] movu m1, [r0 + mmsize] psubw m0, [r2] psubw m1, [r2 + mmsize] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + r1] movu m1, [r0 + r1 + mmsize] psubw m0, [r2 + r3] psubw m1, [r2 + r3 + mmsize] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jne .loop paddd m2, m3 HADDD m2, m0 movd eax, xm2 RET INIT_YMM avx2 cglobal pixel_ssd_ss_64x64, 4,5,4 add r1d, r1d add r3d, r3d pxor m2, m2 pxor m3, m3 mov r4d,64 .loop: movu m0, [r0] movu m1, [r0 + mmsize] psubw m0, [r2] psubw m1, [r2 + mmsize] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 movu m0, [r0 + 2 * mmsize] movu m1, [r0 + 3 * mmsize] psubw m0, [r2 + 2 * mmsize] psubw m1, [r2 + 3 * mmsize] pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 add r0, r1 add r2, r3 dec r4d jne .loop paddd m2, m3 HADDD m2, m0 movd eax, xm2 RET %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 %macro SSD_LOAD_FULL 5 movu m1, [t0+%1] movu m2, [t2+%2] movu m3, [t0+%3] movu m4, [t2+%4] %if %5==1 add t0, t1 add t2, t3 %elif %5==2 lea t0, [t0+2*t1] lea t2, [t2+2*t3] %endif %endmacro %macro LOAD 5 movh m%1, %3 movh m%2, %4 %if %5 lea t0, [t0+2*t1] %endif %endmacro %macro JOIN 7 movh m%3, %5 movh m%4, %6 %if %7 lea t2, [t2+2*t3] %endif punpcklbw m%1, m7 punpcklbw m%3, m7 psubw m%1, m%3 punpcklbw m%2, m7 punpcklbw m%4, m7 psubw m%2, m%4 %endmacro %macro JOIN_SSE2 7 movh m%3, %5 movh m%4, %6 %if %7 lea t2, [t2+2*t3] %endif punpcklqdq m%1, m%2 punpcklqdq m%3, m%4 DEINTB %2, %1, %4, %3, 7 psubw m%2, m%4 psubw m%1, m%3 %endmacro %macro JOIN_SSSE3 7 movh m%3, %5 movh m%4, %6 %if %7 lea 
t2, [t2+2*t3] %endif punpcklbw m%1, m%3 punpcklbw m%2, m%4 %endmacro %macro LOAD_AVX2 5 mova xm%1, %3 vinserti128 m%1, m%1, %4, 1 %if %5 lea t0, [t0+2*t1] %endif %endmacro %macro JOIN_AVX2 7 mova xm%2, %5 vinserti128 m%2, m%2, %6, 1 %if %7 lea t2, [t2+2*t3] %endif SBUTTERFLY bw, %1, %2, %3 %endmacro %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 LOAD 3, 4, [t0+%1], [t0+%3], %5 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 %endmacro %macro SSD_CORE 7-8 %ifidn %8, FULL mova m%6, m%2 mova m%7, m%4 psubusb m%2, m%1 psubusb m%4, m%3 psubusb m%1, m%6 psubusb m%3, m%7 por m%1, m%2 por m%3, m%4 punpcklbw m%2, m%1, m%5 punpckhbw m%1, m%5 punpcklbw m%4, m%3, m%5 punpckhbw m%3, m%5 %endif pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_CORE_SSE2 7-8 %ifidn %8, FULL DEINTB %6, %1, %7, %2, %5 psubw m%6, m%7 psubw m%1, m%2 SWAP %6, %2, %1 DEINTB %6, %3, %7, %4, %5 psubw m%6, m%7 psubw m%3, m%4 SWAP %6, %4, %3 %endif pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_CORE_SSSE3 7-8 %ifidn %8, FULL punpckhbw m%6, m%1, m%2 punpckhbw m%7, m%3, m%4 punpcklbw m%1, m%2 punpcklbw m%3, m%4 SWAP %6, %2, %3 SWAP %7, %4 %endif pmaddubsw m%1, m%5 pmaddubsw m%2, m%5 pmaddubsw m%3, m%5 pmaddubsw m%4, m%5 pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_ITER 6 SSD_LOAD_%1 %2,%3,%4,%5,%6 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 %endmacro ;----------------------------------------------------------------------------- ; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD 2 %if %1 != %2 %assign function_align 8 %else %assign function_align 16 %endif cglobal pixel_ssd_%1x%2, 0,0,0 mov al, %1*%2/mmsize/2 %if %1 != %2 jmp mangle(private_prefix %+ _ %+ pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop) %else .startloop: %if ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 PROLOGUE 0,0,8 %else PROLOGUE 0,5 DECLARE_REG_TMP 1,2,3,4 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m %endif %if cpuflag(ssse3) mova m7, [hsub_mul] %elifidn cpuname, sse2 mova m7, [pw_00ff] %elif %1 >= mmsize pxor m7, m7 %endif pxor m0, m0 ALIGN 16 .loop: %if %1 > mmsize SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 %elif %1 == mmsize SSD_ITER FULL, 0, 0, t1, t3, 2 %else SSD_ITER HALF, 0, 0, t1, t3, 2 %endif dec al jg .loop %if mmsize==32 vextracti128 xm1, m0, 1 paddd xm0, xm1 HADDD xm0, xm1 movd eax, xm0 %else HADDD m0, m1 movd eax, m0 %endif %if (mmsize == 8) emms %endif RET %endif %endmacro %macro AVS_SSD 0 SSD 32, 64 SSD 16, 64 SSD 32, 32 SSD 32, 16 SSD 16, 32 SSD 32, 8 SSD 8, 32 SSD 32, 24 SSD 24, 24 ; not used, but resolves xavs2_pixel_ssd_24x24_sse2.startloop symbol SSD 8, 4 SSD 8, 8 SSD 16, 16 SSD 16, 12 SSD 16, 8 SSD 8, 16 SSD 16, 4 %endmacro INIT_MMX mmx SSD 16, 16 SSD 16, 8 SSD 8, 8 SSD 8, 16 SSD 4, 4 SSD 8, 4 SSD 4, 8 SSD 4, 16 INIT_XMM sse2slow SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 INIT_XMM sse2 %define SSD_CORE SSD_CORE_SSE2 %define JOIN JOIN_SSE2 AVS_SSD INIT_XMM ssse3 %define SSD_CORE SSD_CORE_SSSE3 %define JOIN JOIN_SSSE3 AVS_SSD INIT_XMM avx AVS_SSD INIT_MMX ssse3 SSD 4, 4 SSD 4, 8 SSD 4, 16 INIT_XMM xop SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 %define LOAD LOAD_AVX2 %define JOIN JOIN_AVX2 INIT_YMM avx2 SSD 16, 16 SSD 16, 8 SSD 32, 32 SSD 64, 64 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH 
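;-----------------------------------------------------------------------------
; Reference behaviour of the pixel_ssd_WxH kernels in this file: each one
; returns the sum of squared differences between two WxH blocks.  A minimal
; C sketch of the 8-bit scalar case is given below for orientation; the
; function and parameter names are illustrative only (not part of the API),
; and the HIGH_BIT_DEPTH variants do the same arithmetic on uint16_t samples.
;
;   static uint64_t ssd_ref(const uint8_t *pix1, intptr_t stride1,
;                           const uint8_t *pix2, intptr_t stride2,
;                           int width, int height)
;   {
;       uint64_t sum = 0;
;       for (int y = 0; y < height; y++) {
;           for (int x = 0; x < width; x++) {
;               const int d = pix1[x] - pix2[x];
;               sum += (uint64_t)(d * d);    /* accumulate squared difference */
;           }
;           pix1 += stride1;                 /* strides are given in samples */
;           pix2 += stride2;
;       }
;       return sum;
;   }
;
; The SIMD versions differ only in how rows are unrolled and in how the
; packed partial sums are widened before the final horizontal add.
;-----------------------------------------------------------------------------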
;----------------------------------------------------------------------------- ; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2 pxor m6, m6 mov r4d, 4 .loop: movu m0, [r0] movu m1, [r2] movu m2, [r0 + r1] movu m3, [r2 + r3] punpckhdq m4, m0, m2 punpckhdq m5, m1, m3 pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m2, m2 pmovzxbw m3, m3 pmovzxbw m4, m4 pmovzxbw m5, m5 psubw m0, m1 psubw m2, m3 psubw m4, m5 pmaddwd m0, m0 pmaddwd m2, m2 pmaddwd m4, m4 paddd m0, m2 paddd m6, m4 paddd m6, m0 movu m0, [r0 + 2 * r1] movu m1, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m2, [r0 + r1] movu m3, [r2 + r3] punpckhdq m4, m0, m2 punpckhdq m5, m1, m3 pmovzxbw m0, m0 pmovzxbw m1, m1 pmovzxbw m2, m2 pmovzxbw m3, m3 pmovzxbw m4, m4 pmovzxbw m5, m5 psubw m0, m1 psubw m2, m3 psubw m4, m5 pmaddwd m0, m0 pmaddwd m2, m2 pmaddwd m4, m4 paddd m0, m2 paddd m6, m4 paddd m6, m0 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop HADDD m6, m1 movd eax, m6 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4d, 16 .loop: movu m1, [r0] pmovzxbw m0, m1 punpckhbw m1, m6 pmovzxbw m2, [r0 + 16] movu m4, [r2] pmovzxbw m3, m4 punpckhbw m4, m6 pmovzxbw m5, [r2 + 16] psubw m0, m3 psubw m1, m4 psubw m2, m5 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m7, m2 paddd m7, m0 movu m1, [r0 + r1] pmovzxbw m0, m1 punpckhbw m1, m6 pmovzxbw m2, [r0 + r1 + 16] movu m4, [r2 + r3] pmovzxbw m3, m4 punpckhbw m4, m6 pmovzxbw m5, [r2 + r3 + 16] psubw m0, m3 psubw m1, m4 psubw m2, m5 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 paddd m0, m1 paddd m7, m2 paddd m7, m0 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop HADDD m7, m1 movd eax, m7 RET %macro PIXEL_SSD_16x4 0 movu m1, [r0] pmovzxbw m0, m1 punpckhbw m1, m6 movu m3, [r2] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 movu m5, [r0 + r1] pmovzxbw m4, m5 punpckhbw m5, m6 movu m3, [r2 + r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m4, m4 pmaddwd m5, m5 paddd m0, m1 paddd m4, m5 paddd m4, m0 paddd m7, m4 movu m1, [r0 + r6] pmovzxbw m0, m1 punpckhbw m1, m6 movu m3, [r2 + 2 * r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] movu m5, [r0 + r1] pmovzxbw m4, m5 punpckhbw m5, m6 movu m3, [r2 + r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m4, m4 pmaddwd m5, m5 paddd m0, m1 paddd m4, m5 paddd m4, m0 paddd m7, m4 %endmacro cglobal pixel_ssd_16x16_internal PIXEL_SSD_16x4 lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] PIXEL_SSD_16x4 lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] PIXEL_SSD_16x4 lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] PIXEL_SSD_16x4 ret ;----------------------------------------------------------------------------- ; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4, r0 mov r5, r2 lea r6, [r1 * 2] call pixel_ssd_16x16_internal lea r0, 
[r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 16] lea r2, [r5 + 16] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 32] lea r2, [r5 + 32] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4, r0 mov r5, r2 lea r6, [r1 * 2] call pixel_ssd_16x16_internal lea r0, [r4 + 16] lea r2, [r5 + 16] call pixel_ssd_16x16_internal lea r0, [r4 + 32] lea r2, [r5 + 32] call pixel_ssd_16x16_internal lea r0, [r4 + 48] lea r2, [r5 + 48] call pixel_ssd_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4, r0 mov r5, r2 lea r6, [r1 * 2] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 16] lea r2, [r5 + 16] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 32] lea r2, [r5 + 32] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 48] lea r2, [r5 + 48] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4, r0 mov r5, r2 lea r6, [r1 * 2] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 16] lea r2, [r5 + 16] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 32] lea r2, [r5 + 32] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 48] lea r2, [r5 + 48] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal HADDD m7, m1 movd eax, m7 RET 
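;-----------------------------------------------------------------------------
; The 48- and 64-pixel wide SSE4 kernels above and below are built by tiling
; pixel_ssd_16x16_internal: the caller saves the block origins in r4/r5, sets
; r6 = 2*stride1 and m6 = 0, then walks 16x16 tiles column by column while the
; internal routine keeps accumulating into m7.  Roughly (C-level sketch, names
; illustrative only):
;
;   for (cx = 0; cx < width;  cx += 16)
;       for (cy = 0; cy < height; cy += 16)
;           sum += ssd_16x16(pix1 + cy*stride1 + cx, pix2 + cy*stride2 + cx);
;-----------------------------------------------------------------------------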
;----------------------------------------------------------------------------- ; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r4, r0 mov r5, r2 lea r6, [r1 * 2] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 16] lea r2, [r5 + 16] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 32] lea r2, [r5 + 32] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r4 + 48] lea r2, [r5 + 48] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal lea r0, [r0 + r6] lea r2, [r2 + 2 * r3] call pixel_ssd_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_ssd_sp_4x4_internal movh m0, [r0] movh m1, [r0 + r1] punpcklqdq m0, m1 movd m2, [r2] movd m3, [r2 + r3] punpckldq m2, m3 pmovzxbw m2, m2 psubw m0, m2 movh m4, [r0 + 2 * r1] movh m5, [r0 + r4] punpcklqdq m4, m5 movd m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] punpckldq m6, m1 pmovzxbw m6, m6 psubw m4, m6 pmaddwd m0, m0 pmaddwd m4, m4 paddd m0, m4 paddd m7, m0 ret ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] call pixel_ssd_sp_4x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call 
pixel_ssd_sp_4x4_internal HADDD m7, m1 movd eax, m7 RET cglobal pixel_ssd_sp_8x4_internal movu m0, [r0] movu m1, [r0 + r1] movh m2, [r2] movh m3, [r2 + r3] pmovzxbw m2, m2 pmovzxbw m3, m3 psubw m0, m2 psubw m1, m3 movu m4, [r0 + 2 * r1] movu m5, [r0 + r4] movh m2, [r2 + 2 * r3] movh m3, [r2 + r5] pmovzxbw m2, m2 pmovzxbw m3, m3 psubw m4, m2 psubw m5, m3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m4, m4 pmaddwd m5, m5 paddd m0, m1 paddd m4, m5 paddd m4, m0 paddd m7, m4 ret ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 add r1, r1 lea r4, [r1 * 3] mov r5, r0 mov r6, r2 call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal lea r0, [r0 + 4 * 
r1] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_4x4_internal lea r0, [r5 + 8] lea r2, [r6 + 4] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET %macro PIXEL_SSD_SP_16x4 0 movu m0, [r0] movu m1, [r0 + 16] movu m3, [r2] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 movu m4, [r0 + r1] movu m5, [r0 + r1 +16] movu m3, [r2 + r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m4, m4 pmaddwd m5, m5 paddd m0, m1 paddd m4, m5 paddd m4, m0 paddd m7, m4 movu m0, [r0 + 2 * r1] movu m1, [r0 + 2 * r1 + 16] movu m3, [r2 + 2 * r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m0, m2 psubw m1, m3 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m4, [r0 + r1] movu m5, [r0 + r1 + 16] movu m3, [r2 + r3] pmovzxbw m2, m3 punpckhbw m3, m6 psubw m4, m2 psubw m5, m3 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m4, m4 pmaddwd m5, m5 paddd m0, m1 paddd m4, m5 paddd m4, m0 paddd m7, m4 %endmacro ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x4, 4, 6, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 PIXEL_SSD_SP_16x4 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 lea r4, [r1 * 2] lea r5, [r3 * 2] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + r5] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + r5] PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 lea r4, [r1 * 2] lea r5, [r3 * 2] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + r5] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + r5] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + r5] PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET cglobal pixel_ssd_sp_16x16_internal PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 ret ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t ) 
;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 lea r4, [r1 * 2] lea r5, [r3 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + r5] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + r5] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + r5] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2 pxor m6, m6 pxor m7, m7 add r1, r1 lea r4, [r1 * 2] mov r5, r0 mov r6, r2 call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] lea r4, [r1 * 3] lea r5, [r3 * 3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] call pixel_ssd_sp_8x4_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r5 + 32] lea r2, [r6 + 16] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, 
m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] PIXEL_SSD_SP_16x4 HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 64] lea r2, [r6 + 32] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, 
stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 64] lea r2, [r6 + 32] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 96] lea r2, [r6 + 48] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 64] lea r2, [r6 + 32] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 96] lea r2, [r6 + 48] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 64] lea r2, [r6 + 32] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 96] lea r2, [r6 + 48] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2 pxor m7, m7 pxor m6, m6 mov r5, r0 mov r6, r2 add r1, r1 lea r4, [r1 * 2] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 32] lea r2, [r6 + 16] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 64] lea r2, [r6 + 32] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, 
[r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r5 + 96] lea r2, [r6 + 48] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal lea r0, [r0 + r4] lea r2, [r2 + 2 * r3] call pixel_ssd_sp_16x16_internal HADDD m7, m1 movd eax, m7 RET ;----------------------------------------------------------------------------- ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal pixel_ssd_s_4, 2,2,2 add r1, r1 movh m0, [r0] movhps m0, [r0 + r1] lea r0, [r0 + r1 * 2] movh m1, [r0] movhps m1, [r0 + r1] pmaddwd m0, m0 pmaddwd m1, m1 paddd m0, m1 ; calculate sum and return HADDD m0, m1 movd eax, m0 RET INIT_XMM sse2 cglobal pixel_ssd_s_8, 2,3,5 add r1, r1 lea r2, [r1 * 3] movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r2] pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 paddd m0, m1 paddd m2, m3 paddd m0, m2 lea r0, [r0 + r1 * 4] movu m4, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r2] pmaddwd m4, m4 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 paddd m4, m1 paddd m2, m3 paddd m4, m2 paddd m0, m4 ; calculate sum and return HADDD m0, m1 movd eax, m0 RET INIT_XMM sse2 cglobal pixel_ssd_s_16, 2,3,5 add r1, r1 mov r2d, 4 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r0 + mmsize] movu m3, [r0 + r1] movu m4, [r0 + r1 + mmsize] lea r0, [r0 + r1 * 2] pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 movu m1, [r0] movu m2, [r0 + mmsize] movu m3, [r0 + r1] movu m4, [r0 + r1 + mmsize] lea r0, [r0 + r1 * 2] pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 dec r2d jnz .loop ; calculate sum and return HADDD m0, m1 movd eax, m0 RET INIT_XMM sse2 cglobal pixel_ssd_s_32, 2,3,5 add r1, r1 mov r2d, 16 pxor m0, m0 .loop: movu m1, [r0 + 0 * mmsize] movu m2, [r0 + 1 * mmsize] movu m3, [r0 + 2 * mmsize] movu m4, [r0 + 3 * mmsize] add r0, r1 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 movu m1, [r0 + 0 * mmsize] movu m2, [r0 + 1 * mmsize] movu m3, [r0 + 2 * mmsize] movu m4, [r0 + 3 * mmsize] add r0, r1 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 dec r2d jnz .loop %if BIT_DEPTH >= 10 movu m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m0, m1 movhlps m1, m0 paddq m0, m1 movq rax, xm0 %else ; calculate sum and return HADDD m0, m1 movd eax, m0 %endif RET INIT_YMM avx2 cglobal pixel_ssd_s_16, 2,4,5 add r1, r1 lea r3, [r1 * 3] mov r2d, 16/4 pxor m0, m0 .loop: movu m1, [r0] movu m2, [r0 + r1] movu m3, [r0 + 2 * r1] movu m4, [r0 + r3] lea r0, [r0 + r1 * 4] pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 dec r2d jnz .loop ; calculate sum and return HADDD m0, m1 movd eax, xm0 RET INIT_YMM avx2 cglobal pixel_ssd_s_32, 2,4,5 add r1, r1 lea r3, [r1 * 3] mov r2d, 8 pxor m0, m0 .loop: movu m1, [r0 + 0 * mmsize] movu m2, [r0 + 1 * mmsize] movu m3, [r0 + r1 + 0 * mmsize] movu m4, [r0 + r1 + 1 * mmsize] pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 
paddd m0, m1 movu m1, [r0 + r1 * 2 + 0 * mmsize] movu m2, [r0 + r1 * 2 + 1 * mmsize] movu m3, [r0 + r3 + 0 * mmsize] movu m4, [r0 + r3 + 1 * mmsize] lea r0, [r0 + 4 * r1] pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m1, m3 paddd m0, m1 dec r2d jnz .loop %if BIT_DEPTH >= 10 movu m1, m0 pxor m2, m2 punpckldq m0, m2 punpckhdq m1, m2 paddq m0, m1 vextracti128 xm2, m0, 1 paddq xm2, xm0 movhlps xm1, xm2 paddq xm2, xm1 movq rax, xm2 %else ; calculate sum and return HADDD m0, m1 movd eax, xm0 %endif RET xavs2-1.3/source/common/x86/x86inc.asm000066400000000000000000001255441340660520300174700ustar00rootroot00000000000000;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWarae, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Anton Mitrofanov ;* Fiona Glaser ;* Henrik Gramner ;* Min Chen ;* Jiaqi Zhang ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x264ASM assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used in x264. ; Unlike the rest of x264, this file is available under an ISC license, as it ; has significant usefulness outside of x264 and we want it to be available ; to the largest audience possible. Of course, if you modify it for your own ; purposes to add a new feature, we strongly encourage contributing a patch ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . %ifndef private_prefix %define private_prefix xavs2 %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN64 1 %else %define UNIX64 1 %endif %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif %macro SECTION_RODATA 0-1 32 SECTION .rodata align=%1 %endmacro %macro SECTION_TEXT 0-1 16 SECTION .text align=%1 %endmacro %if WIN64 %define PIC %elif ARCH_X86_64 == 0 ; x86_32 doesn't require PIC. 
; Some distros prefer shared objects to be PIC, but nothing breaks if ; the code contains a few textrels, so we'll skip that complexity. %undef PIC %endif %ifdef PIC default rel %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most of x264's asm. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. 
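; Example (an illustrative sketch only; the function name is hypothetical and
; not defined anywhere in this project): a leaf routine written with the
; abstractions documented above.  PROLOGUE loads the two pointer arguments
; into r0/r1 (named dst/src here) on every supported calling convention, and
; RET undoes whatever PROLOGUE pushed:
;
;     INIT_XMM sse2
;     cglobal copy_16_bytes, 2, 2, 1, dst, src
;         mova    m0, [srcq]          ; arguments already loaded by PROLOGUE
;         mova    [dstq], m0
;         RET                         ; pops callee-saved state and returns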
; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assert failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign 
%%pad %%pad + 32 ; shadow space %if mmsize != 8 %assign xmm_regs_used %2 %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 1 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 %assign regs_used (regs_used + 1) %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 %warning "Stack pointer will overwrite register argument" %endif %endif %endif %endmacro %macro DEFINE_ARGS_INTERNAL 3+ %ifnum %2 DEFINE_ARGS %3 %elif %1 == 4 DEFINE_ARGS %2 %elif %1 > 4 DEFINE_ARGS %2, %3 %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R12, 96 DECLARE_REG 12, R13, 104 DECLARE_REG 13, R14, 112 DECLARE_REG 14, R15, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. %if xmm_regs_used > 6 movaps [rstk + stack_offset + 8], xmm6 %endif %if xmm_regs_used > 7 movaps [rstk + stack_offset + 24], xmm7 %endif %if xmm_regs_used > 8 %assign %%i 8 %rep xmm_regs_used-8 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 8 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
%assign %%pad (xmm_regs_used-8)*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 %assign %%pad_size 0 %if xmm_regs_used > 8 %assign %%i xmm_regs_used %rep xmm_regs_used-8 %assign %%i %%i-1 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add %1, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 movaps xmm7, [%1 + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 movaps xmm6, [%1 + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 1 WIN64_RESTORE_XMM_INTERNAL %1 %assign stack_offset (stack_offset-stack_size_padded) %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R12, 48 DECLARE_REG 12, R13, 56 DECLARE_REG 13, R14, 64 DECLARE_REG 14, R15, 72 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro %macro WIN64_RESTORE_XMM 1 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue RET %else rep ret %endif %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %ifndef cpuflags times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. %elif notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep %endif ret %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %%branch_instr: %xdefine last_branch_adr %%branch_instr %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 2 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
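; Example (illustrative only; the function name is hypothetical): with the
; default private_prefix "xavs2" and a preceding "INIT_XMM sse4", the
; declaration below emits the global symbol xavs2_example_fn_sse4
; (hidden on ELF, and with a leading underscore when PREFIX is defined):
;
;     INIT_XMM sse4
;     cglobal example_fn, 1, 1, 0, src
;         mov     eax, [srcq]
;         RET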
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro %macro cvisible 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 0, %1 %+ SUFFIX, %2 %endmacro %macro cglobal_internal 2-3+ %if %1 %xdefine %%FUNCTION_PREFIX private_prefix %xdefine %%VISIBILITY hidden %else %xdefine %%FUNCTION_PREFIX public_prefix %xdefine %%VISIBILITY %endif %ifndef cglobaled_%2 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %ifidn __OUTPUT_FORMAT__,elf global %2:function %%VISIBILITY %else global %2 %endif align function_align %2: RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 %ifnidn %3, "" PROLOGUE %3 %endif %endmacro %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro ; like cextern, but without the prefix %macro cextern_naked 1 %xdefine %1 mangle(%1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %ifidn __OUTPUT_FORMAT__,elf global %1:data hidden %else global %1 %endif ALIGN 32 %1: %2 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is ; executable by default. %ifidn __OUTPUT_FORMAT__,elf SECTION .note.GNU-stack noalloc noexec nowrite progbits %endif ; cpuflags %assign cpuflags_mmx (1<<0) %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx %assign cpuflags_3dnow (1<<2) | cpuflags_mmx %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx %assign cpuflags_avx2 (1<<14)| cpuflags_avx %assign cpuflags_fma3 (1<<15)| cpuflags_avx %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) %assign cpuflags_slowctz (1<<18) %assign cpuflags_lzcnt (1<<19) %assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<21) %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
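; Example (illustrative sketch, not part of the original file): after
; "INIT_XMM ssse3" the cpuflag()/notcpuflag() tests see the ssse3 bit and,
; by inheritance, everything it implies (sse3, sse2, ...), so code can pick
; the cheaper instruction when it is available:
;
;     INIT_XMM ssse3
;     cglobal example_absw, 1, 1, 2, src
;         mova    m0, [srcq]
;     %if cpuflag(ssse3)
;         pabsw   m0, m0              ; single-instruction absolute value
;     %else
;         pxor    m1, m1
;         psubw   m1, m0
;         pmaxsw  m0, m1              ; sse2 fallback
;     %endif
;         mova    [srcq], m0
;         RET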
%macro INIT_CPUFLAGS 0-* %xdefine SUFFIX %undef cpuname %assign cpuflags 0 %if %0 >= 1 %rep %0 %ifdef cpuname %xdefine cpuname cpuname %+ _%1 %else %xdefine cpuname %1 %endif %assign cpuflags cpuflags | cpuflags_%1 %rotate 1 %endrep %xdefine SUFFIX _ %+ cpuname %if cpuflag(avx) %assign avx_enabled 1 %endif %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) %define mova movaps %define movu movups %define movnta movntps %endif %if cpuflag(aligned) %define movu mova %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif %endif %if ARCH_X86_64 || cpuflag(sse2) CPU amdnop %else CPU basicnop %endif %endmacro ; Merge mmx and sse* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; (All 3 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define num_mmregs 8 %define mova movq %define movu movq %define movh movd %define movnta movntq %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i CAT_XDEFINE nmm, %%i, %%i %assign %%i %%i+1 %endrep %rep 8 CAT_UNDEF m, %%i CAT_UNDEF nmm, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define num_mmregs 8 %if ARCH_X86_64 %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i CAT_XDEFINE nxmm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define num_mmregs 8 %if ARCH_X86_64 %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, ymm %+ %%i CAT_XDEFINE nymm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymm%1xmm xmm%1 %define xmm%1ymm ymm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %endmacro %assign i 0 %rep 16 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. 
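; Example (illustrative only): SWAP renames registers at assembly time, so
; the sequence below stores the value loaded from [r2] yet emits no extra
; move instruction:
;
;     mova    m0, [r0]
;     mova    m1, [r2]
;     SWAP    0, 1            ; m0 and m1 exchange *names*, not contents
;     mova    [r4], m0        ; writes the data that came from [r2]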
%macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE n, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... SWAP_INTERNAL_NAME %1, %2 %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* %rep %0-1 %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp CAT_XDEFINE n, m%1, %1 CAT_XDEFINE n, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* %xdefine %%args n %+ %1 %rep %0-1 %xdefine %%args %%args, n %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs. %macro SAVE_MM_PERMUTATION 0-1 %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE %%f, %%i, m %+ %%i %assign %%i %%i+1 %endrep %endmacro %macro LOAD_MM_PERMUTATION 1 ; name to load from %ifdef %1_m0 %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i CAT_XDEFINE n, m %+ %%i, %%i %assign %%i %%i+1 %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 call_internal %1, %1 %+ SUFFIX %endmacro %macro call_internal 2 %xdefine %%i %1 %ifndef cglobaled_%1 %ifdef cglobaled_%2 %xdefine %%i %2 %endif %endif call %%i LOAD_MM_PERMUTATION %%i %endmacro ; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 sub %1, -128 %else add %1, %2 %endif %else add %1, %2 %endif %endmacro %macro sub 2 %ifnum %2 %if %2==128 add %1, -128 %else sub %1, %2 %endif %else sub %1, %2 %endif %endmacro ;============================================================================= ; AVX abstraction layer ;============================================================================= %assign i 0 %rep 16 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 %assign i i+1 %endrep %undef i %macro CHECK_AVX_INSTR_EMU 3-* %xdefine %%opcode %1 %xdefine %%dst %2 %rep %0-2 %ifidn %%dst, %3 %error non-avx emulation of ``%%opcode'' is not supported %endif %rotate 1 %endrep %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ %ifnum sizeof%7 %assign __sizeofreg sizeof%7 %elifnum sizeof%6 %assign __sizeofreg sizeof%6 %else %assign __sizeofreg mmsize %endif %assign __emulate_avx 0 %if avx_enabled && __sizeofreg >= 16 %xdefine __instr v%1 %else %xdefine __instr %1 %if %0 >= 8+%4 %assign __emulate_avx 1 %endif %endif %ifnidn %2, fnord %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function %endif %endif %endif %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 %ifnidn %6, %7 %if %0 >= 9 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 %else CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 %endif %if %5 && %4 == 0 %ifnid %8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. 
%xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 MOVAPS %6, __src1 %else MOVDQA %6, __src1 %endif %endif %if %0 >= 9 %1 %6, __src2, %9 %else %1 %6, __src2 %endif %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 __instr %6, %7, %8 %elif %0 == 7 __instr %6, %7 %else __instr %6 %endif %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not %macro AVX_INSTR 1-5 fnord, 0, 1, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 %elifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 %elifidn %4, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 %else RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 %endif %endmacro %endmacro ; Instructions with both VEX and non-VEX encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 AVX_INSTR addsd, sse2, 1, 0, 1 AVX_INSTR addss, sse, 1, 0, 1 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, fnord, 0, 0, 0 AVX_INSTR aesdeclast, fnord, 0, 0, 0 AVX_INSTR aesenc, fnord, 0, 0, 0 AVX_INSTR aesenclast, fnord, 0, 0, 0 AVX_INSTR aesimc AVX_INSTR aeskeygenassist AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 0, 0 AVX_INSTR blendps, sse4, 1, 0, 0 AVX_INSTR blendvpd, sse4, 1, 0, 0 AVX_INSTR blendvps, sse4, 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 AVX_INSTR comisd, sse2 AVX_INSTR comiss, sse AVX_INSTR cvtdq2pd, sse2 AVX_INSTR cvtdq2ps, sse2 AVX_INSTR cvtpd2dq, sse2 AVX_INSTR cvtpd2ps, sse2 AVX_INSTR cvtps2dq, sse2 AVX_INSTR cvtps2pd, sse2 AVX_INSTR cvtsd2si, sse2 AVX_INSTR cvtsd2ss, sse2 AVX_INSTR cvtsi2sd, sse2 AVX_INSTR cvtsi2ss, sse AVX_INSTR cvtss2sd, sse2 AVX_INSTR cvtss2si, sse AVX_INSTR cvttpd2dq, sse2 AVX_INSTR cvttps2dq, sse2 AVX_INSTR cvttsd2si, sse2 AVX_INSTR cvttss2si, sse AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 AVX_INSTR ldmxcsr, sse AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 AVX_INSTR maxsd, sse2, 1, 0, 1 AVX_INSTR maxss, sse, 1, 0, 1 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 1 AVX_INSTR minss, sse, 1, 0, 1 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse AVX_INSTR movd AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 AVX_INSTR movhpd, sse2, 1, 0, 0 AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 AVX_INSTR movmskpd, sse2 AVX_INSTR movmskps, sse AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse AVX_INSTR movq AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR 
movsldup, sse3 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2 AVX_INSTR movups, sse AVX_INSTR mpsadbw, sse4 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 AVX_INSTR mulsd, sse2, 1, 0, 1 AVX_INSTR mulss, sse, 1, 0, 1 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packssdw, mmx, 0, 0, 0 AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 0, 0 AVX_INSTR pblendw, sse4 AVX_INSTR pclmulqdq AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpistri, sse42 AVX_INSTR pcmpistrm, sse42 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4 AVX_INSTR pinsrd, sse4 AVX_INSTR pinsrq, sse4 AVX_INSTR pinsrw, mmx2 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaxsb, sse4, 0, 0, 1 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 AVX_INSTR pshufb, ssse3, 0, 0, 0 AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, 
mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1, 0, 0 AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4 AVX_INSTR roundps, sse4 AVX_INSTR roundsd, sse4 AVX_INSTR roundss, sse4 AVX_INSTR rsqrtps, sse, 1, 0, 0 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 AVX_INSTR sqrtpd, sse2, 1, 0, 0 AVX_INSTR sqrtps, sse, 1, 0, 0 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 AVX_INSTR ucomisd, sse2 AVX_INSTR ucomiss, sse AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 AVX_INSTR unpcklps, sse, 1, 0, 0 AVX_INSTR xorpd, sse2, 1, 0, 1 AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 AVX_INSTR pfmul, 3dnow, 1, 0, 1 ; base-4 constants for shuffles %assign i 0 %rep 256 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) %if j < 10 CAT_XDEFINE q000, j, i %elif j < 100 CAT_XDEFINE q00, j, i %elif j < 1000 CAT_XDEFINE q0, j, i %else CAT_XDEFINE q, j, i %endif %assign i i+1 %endrep %undef i %undef j %macro FMA_INSTR 3 %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 %else %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmadcswd, pmaddwd, paddd ; convert FMA4 to FMA3 if possible %macro FMA4_INSTR 4 %macro %1 4-8 %1, %2, %3, %4 %if cpuflag(fma4) v%5 %1, %2, %3, %4 %elifidn %1, %2 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 %elifidn %1, %3 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 %elifidn %1, %4 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 %else %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss ; workaround: 
vpbroadcastq is broken in x86_32 due to a yasm bug %if ARCH_X86_64 == 0 %macro vpbroadcastq 2 %if sizeof%1 == 16 movddup %1, %2 %else vbroadcastsd %1, %2 %endif %endmacro %endif xavs2-1.3/source/common/x86/x86util.asm000066400000000000000000000515161340660520300176710ustar00rootroot00000000000000;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Holger Lubitz ;* Loren Merritt ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %assign FENC_STRIDE 64 %assign FDEC_STRIDE 32 %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte %define vpbroadcastdct vpbroadcastw %define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word %define vpbroadcastdct vpbroadcastd %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE %assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE %assign PIXEL_MAX ((1 << BIT_DEPTH)-1) %macro FIX_STRIDES 1-* %if HIGH_BIT_DEPTH %rep %0 add %1, %1 %rotate 1 %endrep %endif %endmacro %macro SBUTTERFLY 4 %ifidn %1, dqqq vperm2i128 m%4, m%2, m%3, q0301 ; punpckh vinserti128 m%2, m%2, xm%3, 1 ; punpckl %elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 %endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 punpckl%1 m%4, m%2, m%3 punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro %macro TRANSPOSE4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE2x4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SBUTTERFLY qdq, %1, %2, %5 SBUTTERFLY qdq, %3, %4, %5 %endmacro %macro TRANSPOSE4x4D 5 SBUTTERFLY dq, %1, %2, %5 SBUTTERFLY dq, %3, %4, %5 SBUTTERFLY qdq, %1, %3, %5 SBUTTERFLY qdq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE8x8W 9-11 %if ARCH_X86_64 SBUTTERFLY wd, %1, %2, %9 SBUTTERFLY wd, %3, %4, %9 SBUTTERFLY wd, %5, %6, %9 SBUTTERFLY wd, %7, %8, %9 SBUTTERFLY dq, %1, %3, %9 SBUTTERFLY dq, %2, %4, %9 SBUTTERFLY dq, %5, %7, %9 SBUTTERFLY dq, %6, %8, %9 SBUTTERFLY qdq, %1, %5, %9 SBUTTERFLY qdq, %2, %6, %9 SBUTTERFLY qdq, %3, %7, %9 SBUTTERFLY qdq, %4, %8, %9 SWAP %2, %5 SWAP %4, %7 %else ; in: m0..m7, unless %11 in which case m6 is in %9 ; out: m0..m7, unless %11 in which case m4 is in %10 ; spills into %9 and %10 %if %0<11 movdqa %9, m%7 
%endif SBUTTERFLY wd, %1, %2, %7 movdqa %10, m%2 movdqa m%7, %9 SBUTTERFLY wd, %3, %4, %2 SBUTTERFLY wd, %5, %6, %2 SBUTTERFLY wd, %7, %8, %2 SBUTTERFLY dq, %1, %3, %2 movdqa %9, m%3 movdqa m%2, %10 SBUTTERFLY dq, %2, %4, %3 SBUTTERFLY dq, %5, %7, %3 SBUTTERFLY dq, %6, %8, %3 SBUTTERFLY qdq, %1, %5, %3 SBUTTERFLY qdq, %2, %6, %3 movdqa %10, m%2 movdqa m%3, %9 SBUTTERFLY qdq, %3, %7, %2 SBUTTERFLY qdq, %4, %8, %2 SWAP %2, %5 SWAP %4, %7 %if %0<11 movdqa m%5, %10 %endif %endif %endmacro %macro WIDEN_SXWD 2 punpckhwd m%2, m%1 psrad m%2, 16 %if cpuflag(sse4) pmovsxwd m%1, m%1 %else punpcklwd m%1, m%1 psrad m%1, 16 %endif %endmacro %macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src) %if cpuflag(ssse3) pabsw %1, %2 %elifidn %3, sign ; version for pairing with PSIGNW: modifies src pxor %1, %1 pcmpgtw %1, %2 pxor %2, %1 psubw %2, %1 SWAP %1, %2 %elifidn %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %elifid %2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %elif %0 == 2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %else mova %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %endif %endmacro %macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp %if cpuflag(ssse3) pabsw %1, %3 pabsw %2, %4 %elifidn %1, %3 pxor %5, %5 pxor %6, %6 psubw %5, %1 psubw %6, %2 pmaxsw %1, %5 pmaxsw %2, %6 %else pxor %1, %1 pxor %2, %2 psubw %1, %3 psubw %2, %4 pmaxsw %1, %3 pmaxsw %2, %4 %endif %endmacro %macro ABSB 2 %if cpuflag(ssse3) pabsb %1, %1 %else pxor %2, %2 psubb %2, %1 pminub %1, %2 %endif %endmacro %macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else %define %%s %2 %if %0 == 3 mova %3, %2 %define %%s %3 %endif pxor %1, %1 pcmpgtd %1, %%s pxor %%s, %1 psubd %%s, %1 SWAP %1, %%s %endif %endmacro %macro PSIGN 3-4 %if cpuflag(ssse3) && %0 == 4 psign%1 %2, %3, %4 %elif cpuflag(ssse3) psign%1 %2, %3 %elif %0 == 4 pxor %2, %3, %4 psub%1 %2, %4 %else pxor %2, %3 psub%1 %2, %3 %endif %endmacro %define PSIGNW PSIGN w, %define PSIGND PSIGN d, %macro SPLATB_LOAD 3 %if cpuflag(ssse3) movd %1, [%2-3] pshufb %1, %3 %else movd %1, [%2-3] ;to avoid crossing a cacheline punpcklbw %1, %1 SPLATW %1, %1, 3 %endif %endmacro %imacro SPLATW 2-3 0 %if cpuflag(avx2) && %3 == 0 vpbroadcastw %1, %2 %else PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 punpcklqdq %1, %1 %endif %endif %endmacro %imacro SPLATD 2-3 0 %if mmsize == 16 pshufd %1, %2, (%3)*q1111 %else pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010 %endif %endmacro %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 %endmacro %macro CLIPW2 4 ;(dst0, dst1, min, max) pmaxsw %1, %3 pmaxsw %2, %3 pminsw %1, %4 pminsw %2, %4 %endmacro %macro HADDD 2 ; sum junk %if sizeof%1 == 32 %define %2 xmm%2 vextracti128 %2, %1, 1 %define %1 xmm%1 paddd %1, %2 %endif %if mmsize >= 16 %if cpuflag(xop) && sizeof%1 == 16 vphadddq %1, %1 %endif movhlps %2, %1 paddd %1, %2 %endif %if notcpuflag(xop) PSHUFLW %2, %1, q0032 paddd %1, %2 %endif %undef %1 %undef %2 %endmacro %macro HADDW 2 ; reg, tmp %if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 %else pmaddwd %1, [pw_1] HADDD %1, %2 %endif %endmacro %macro HADDUWD 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 %endif %endmacro %macro HADDUW 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwq %1, %1 movhlps %2, %1 paddd %1, %2 %else HADDUWD %1, %2 HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp ; AVX2 version uses a precalculated extra input that ; can be re-used across calls %if sizeof%1==32 ; %3 = abcdefgh ijklmnop (lower address) ; %2 = 
ABCDEFGH IJKLMNOP (higher address) vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH %if %3 < 16 palignr %1, %4, %2, %3 ; %1 = bcdefghi jklmnopA %else palignr %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO %endif %elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else palignr %1, %2, %3 %endif %else %define %%dst %1 %if %0==5 %ifnidn %1, %2 mova %%dst, %2 %endif %rotate 1 %endif %ifnidn %4, %2 mova %4, %2 %endif %if mmsize==8 psllq %%dst, (8-%3)*8 psrlq %4, %3*8 %else pslldq %%dst, 16-%3 psrldq %4, %3 %endif por %%dst, %4 %endif %endmacro %macro PSHUFLW 1+ %if mmsize == 8 pshufw %1 %else pshuflw %1 %endif %endmacro ; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes ; values shifted in are undefined ; faster if dst==src %define PSLLPIX PSXLPIX l, -1, ;dst, src, shift %define PSRLPIX PSXLPIX r, 1, ;dst, src, shift %macro PSXLPIX 5 %if mmsize == 8 %if %5&1 ps%1lq %3, %4, %5*8 %else pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff %endif %else ps%1ldq %3, %4, %5*2 %endif %endmacro %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from %ifnum %5 pand m%3, m%5, m%4 ; src .. y6 .. y4 pand m%1, m%5, m%2 ; dst .. y6 .. y4 %else mova m%1, %5 pand m%3, m%1, m%4 ; src .. y6 .. y4 pand m%1, m%1, m%2 ; dst .. y6 .. y4 %endif psrlw m%2, 8 ; dst .. y7 .. y5 psrlw m%4, 8 ; src .. y7 .. y5 %endmacro %macro SUMSUB_BA 3-4 %if %0==3 padd%1 m%2, m%3 padd%1 m%3, m%3 psub%1 m%3, m%2 %elif avx_enabled padd%1 m%4, m%2, m%3 psub%1 m%3, m%2 SWAP %2, %4 %else mova m%4, m%2 padd%1 m%2, m%3 psub%1 m%3, m%4 %endif %endmacro %macro SUMSUB_BADC 5-6 %if %0==6 SUMSUB_BA %1, %2, %3, %6 SUMSUB_BA %1, %4, %5, %6 %else padd%1 m%2, m%3 padd%1 m%4, m%5 padd%1 m%3, m%3 padd%1 m%5, m%5 psub%1 m%3, m%2 psub%1 m%5, m%4 %endif %endmacro %macro HADAMARD4_V 4+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %1, %3, %2, %4 %endmacro %macro HADAMARD8_V 8+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %5, %6, %7, %8 SUMSUB_BADC w, %1, %3, %2, %4 SUMSUB_BADC w, %5, %7, %6, %8 SUMSUB_BADC w, %1, %5, %2, %6 SUMSUB_BADC w, %3, %7, %4, %8 %endmacro %macro TRANS_SSE2 5-6 ; TRANSPOSE2x2 ; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq ; %2: ord/unord (for compat with sse4, unused) ; %3/%4: source regs ; %5/%6: tmp regs %ifidn %1, d %define mask [mask_10] %define shift 16 %elifidn %1, q %define mask [mask_1100] %define shift 32 %endif %if %0==6 ; less dependency if we have two tmp mova m%5, mask ; ff00 mova m%6, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pand m%6, m%5 ; x5.. pandn m%5, m%3 ; ..x0 psrl%1 m%3, shift ; ..x1 por m%4, m%5 ; x4x0 por m%3, m%6 ; x5x1 %else ; more dependency, one insn less. sometimes faster, sometimes not mova m%5, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pxor m%4, m%3 ; (x4^x1)x0 pand m%4, mask ; (x4^x1).. 
pxor m%3, m%4 ; x4x0 psrl%1 m%4, shift ; ..(x1^x4) pxor m%5, m%4 ; x5x1 SWAP %4, %3, %5 %endif %endmacro %macro TRANS_SSE4 5-6 ; see above %ifidn %1, d %ifidn %2, ord psrl%1 m%5, m%3, 16 pblendw m%5, m%4, q2222 psll%1 m%4, 16 pblendw m%4, m%3, q1111 SWAP %3, %5 %else %if avx_enabled pblendw m%5, m%3, m%4, q2222 SWAP %3, %5 %else mova m%5, m%3 pblendw m%3, m%4, q2222 %endif psll%1 m%4, 16 psrl%1 m%5, 16 por m%4, m%5 %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro %macro TRANS_XOP 5-6 %ifidn %1, d vpperm m%5, m%3, m%4, [transd_shuf1] vpperm m%3, m%3, m%4, [transd_shuf2] %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%4, q2020 %endif SWAP %4, %5 %endmacro %macro HADAMARD 5-6 ; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) ; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) ; %3/%4: regs ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub %define ORDER ord ; sumsub needs order because a-b != b-a unless a=b %else %define ORDER unord ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 %if mmsize==8 SBUTTERFLY dq, %3, %4, %5 %else TRANS q, ORDER, %3, %4, %5, %6 %endif %elif %1==4 SBUTTERFLY qdq, %3, %4, %5 %elif %1==8 SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub SUMSUB_BA w, %3, %4, %5 %else %ifidn %2, amax %if %0==6 ABSW2 m%3, m%4, m%3, m%4, m%5, m%6 %else ABSW m%3, m%3, m%5 ABSW m%4, m%4, m%5 %endif %endif pmaxsw m%3, m%4 %endif %endmacro %macro HADAMARD2_2D 6-7 sumsub HADAMARD 0, sumsub, %1, %2, %5 HADAMARD 0, sumsub, %3, %4, %5 SBUTTERFLY %6, %1, %2, %5 %ifnum %7 HADAMARD 0, amax, %1, %2, %5, %7 %else HADAMARD 0, %7, %1, %2, %5 %endif SBUTTERFLY %6, %3, %4, %5 %ifnum %7 HADAMARD 0, amax, %3, %4, %5, %7 %else HADAMARD 0, %7, %3, %4, %5 %endif %endmacro %macro HADAMARD4_2D 5-6 sumsub HADAMARD2_2D %1, %2, %3, %4, %5, wd HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 SWAP %2, %3 %endmacro %macro HADAMARD4_2D_SSE 5-6 sumsub HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 HADAMARD2_2D %1, %3, %2, %4, %5, dq SBUTTERFLY qdq, %1, %2, %5 HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 SBUTTERFLY qdq, %3, %4, %5 HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 %endmacro %macro HADAMARD8_2D 9-10 sumsub HADAMARD2_2D %1, %2, %3, %4, %9, wd HADAMARD2_2D %5, %6, %7, %8, %9, wd HADAMARD2_2D %1, %3, %2, %4, %9, dq HADAMARD2_2D %5, %7, %6, %8, %9, dq HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 %ifnidn %10, amax SWAP %2, %5 SWAP %4, %7 %endif %endmacro ; doesn't include the "pmaddubsw hmul_8p" pass %macro HADAMARD8_2D_HMUL 10 HADAMARD4_V %1, %2, %3, %4, %9 HADAMARD4_V %5, %6, %7, %8, %9 SUMSUB_BADC w, %1, %5, %2, %6, %9 HADAMARD 2, sumsub, %1, %5, %9, %10 HADAMARD 2, sumsub, %2, %6, %9, %10 SUMSUB_BADC w, %3, %7, %4, %8, %9 HADAMARD 2, sumsub, %3, %7, %9, %10 HADAMARD 2, sumsub, %4, %8, %9, %10 HADAMARD 1, amax, %1, %5, %9, %10 HADAMARD 1, amax, %2, %6, %9, %5 HADAMARD 1, amax, %3, %7, %9, %5 HADAMARD 1, amax, %4, %8, %9, %5 %endmacro %macro SUMSUB2_AB 4 %if cpuflag(xop) pmacs%1%1 m%4, m%3, [p%1_m2], m%2 pmacs%1%1 m%2, m%2, [p%1_2], m%3 %elifnum %3 psub%1 m%4, m%2, m%3 psub%1 m%4, m%3 padd%1 m%2, m%2 padd%1 m%2, m%3 %else mova m%4, m%2 padd%1 m%2, m%2 padd%1 m%2, %3 psub%1 m%4, %3 psub%1 m%4, %3 
%endif %endmacro %macro SUMSUBD2_AB 5 %ifnum %4 psra%1 m%5, m%2, 1 ; %3: %3>>1 psra%1 m%4, m%3, 1 ; %2: %2>>1 padd%1 m%4, m%2 ; %3: %3>>1+%2 psub%1 m%5, m%3 ; %2: %2>>1-%3 SWAP %2, %5 SWAP %3, %4 %else mova %5, m%2 mova %4, m%3 psra%1 m%3, 1 ; %3: %3>>1 psra%1 m%2, 1 ; %2: %2>>1 padd%1 m%3, %5 ; %3: %3>>1+%2 psub%1 m%2, %4 ; %2: %2>>1-%3 %endif %endmacro %macro DCT4_1D 5 %ifnum %5 SUMSUB_BADC w, %4, %1, %3, %2, %5 SUMSUB_BA w, %3, %4, %5 SUMSUB2_AB w, %1, %2, %5 SWAP %1, %3, %4, %5, %2 %else SUMSUB_BADC w, %4, %1, %3, %2 SUMSUB_BA w, %3, %4 mova [%5], m%2 SUMSUB2_AB w, %1, [%5], %2 SWAP %1, %3, %4, %2 %endif %endmacro %macro IDCT4_1D 6-7 %ifnum %6 SUMSUBD2_AB %1, %3, %5, %7, %6 ; %3: %3>>1-%5 %5: %3+%5>>1 SUMSUB_BA %1, %4, %2, %7 ; %4: %2+%4 %2: %2-%4 SUMSUB_BADC %1, %5, %4, %3, %2, %7 ; %5: %2+%4 + (%3+%5>>1) ; %4: %2+%4 - (%3+%5>>1) ; %3: %2-%4 + (%3>>1-%5) ; %2: %2-%4 - (%3>>1-%5) %else %ifidn %1, w SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] %else SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] %endif SUMSUB_BA %1, %4, %2 SUMSUB_BADC %1, %5, %4, %3, %2 %endif SWAP %2, %5, %4 ; %2: %2+%4 + (%3+%5>>1) row0 ; %3: %2-%4 + (%3>>1-%5) row1 ; %4: %2-%4 - (%3>>1-%5) row2 ; %5: %2+%4 - (%3+%5>>1) row3 %endmacro %macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH %if %6 ; %5 aligned? mova %1, %4 psubw %1, %5 %else movu %1, %4 movu %2, %5 psubw %1, %2 %endif %else ; !HIGH_BIT_DEPTH %ifidn %3, none movh %1, %4 movh %2, %5 punpcklbw %1, %2 punpcklbw %2, %2 psubw %1, %2 %else movh %1, %4 punpcklbw %1, %3 movh %2, %5 punpcklbw %2, %3 psubw %1, %2 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr %if BIT_DEPTH == 8 && cpuflag(ssse3) movh m%2, [%8+%1*FDEC_STRIDE] movh m%1, [%7+%1*FENC_STRIDE] punpcklbw m%1, m%2 movh m%3, [%8+%2*FDEC_STRIDE] movh m%2, [%7+%2*FENC_STRIDE] punpcklbw m%2, m%3 movh m%4, [%8+%3*FDEC_STRIDE] movh m%3, [%7+%3*FENC_STRIDE] punpcklbw m%3, m%4 movh m%5, [%8+%4*FDEC_STRIDE] movh m%4, [%7+%4*FENC_STRIDE] punpcklbw m%4, m%5 pmaddubsw m%1, m%6 pmaddubsw m%2, m%6 pmaddubsw m%3, m%6 pmaddubsw m%4, m%6 %else LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB] LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB] LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB] LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB] %endif %endmacro %macro STORE_DCT 6 movq [%5+%6+ 0], m%1 movq [%5+%6+ 8], m%2 movq [%5+%6+16], m%3 movq [%5+%6+24], m%4 movhps [%5+%6+32], m%1 movhps [%5+%6+40], m%2 movhps [%5+%6+48], m%3 movhps [%5+%6+56], m%4 %endmacro %macro STORE_IDCT 4 movhps [r0-4*FDEC_STRIDE], %1 movh [r0-3*FDEC_STRIDE], %1 movhps [r0-2*FDEC_STRIDE], %2 movh [r0-1*FDEC_STRIDE], %2 movhps [r0+0*FDEC_STRIDE], %3 movh [r0+1*FDEC_STRIDE], %3 movhps [r0+2*FDEC_STRIDE], %4 movh [r0+3*FDEC_STRIDE], %4 %endmacro %macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? 
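; Illustrative sketch (not a prescribed calling sequence; the register choice
; below is only an example): %1-%4 receive the four rows of residual
; (encode-buffer pixels minus reconstruction pixels), %5/%6 are scratch
; registers, %7 is the zero register LOAD_DIFF uses to widen bytes to words,
; %8/%9 are the two row pointers (defaulting to r0/r2, with the strides in
; r1/r3), %10 asks for both pointers to be advanced by four rows afterwards,
; and %11 marks the loads as aligned (only meaningful in the HIGH_BIT_DEPTH
; path). Assuming m6 has been cleared first, something like
;     pxor m6, m6
;     LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6
; would leave rows 0-3 of the 8-pixel-wide residual in m0-m3, ready for a
; HADAMARD4_2D or DCT4_1D pass.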
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ; 2xdst, 2xtmp, 2xsrcrow %macro LOAD_DIFF16x2_AVX2 6 pmovzxbw m%1, [r1+%5*FENC_STRIDE] pmovzxbw m%2, [r1+%6*FENC_STRIDE] pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] psubw m%1, m%3 psubw m%2, m%4 %endmacro %macro DIFFx2 6-7 movh %3, %5 punpcklbw %3, %4 psraw %1, 6 paddsw %1, %3 movh %3, %6 punpcklbw %3, %4 psraw %2, 6 paddsw %2, %3 packuswb %2, %1 %endmacro ; (high depth) in: %1, %2, min to clip, max to clip, mem128 ; in: %1, tmp, %3, mem64 %macro STORE_DIFF 4-5 %if HIGH_BIT_DEPTH psrad %1, 6 psrad %2, 6 packssdw %1, %2 paddw %1, %5 CLIPW %1, %3, %4 mova %5, %1 %else movh %2, %4 punpcklbw %2, %3 psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1 %endif %endmacro %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 db %1, %1 %else db %1*2 db %1*2+1 %endif %rotate 1 %endrep %endmacro ; instruction, accum, input, iteration (zero to swap, nonzero to add) %macro ACCUM 4 %if %4 %1 m%2, m%3 %else SWAP %2, %3 %endif %endmacro ; IACA support %macro IACA_START 0 mov ebx, 111 db 0x64, 0x67, 0x90 %endmacro %macro IACA_END 0 mov ebx, 222 db 0x64, 0x67, 0x90 %endmacro xavs2-1.3/source/compat/000077500000000000000000000000001340660520300152025ustar00rootroot00000000000000xavs2-1.3/source/compat/getopt/000077500000000000000000000000001340660520300165045ustar00rootroot00000000000000xavs2-1.3/source/compat/getopt/getopt.c000066400000000000000000001072511340660520300201600ustar00rootroot00000000000000/* Getopt for GNU. NOTE: getopt is now part of the C library, so if you don't know what "Keep this file name-space clean" means, talk to drepper@gnu.org before changing it! Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ /* This tells Alpha OSF/1 not to define a getopt prototype in . Ditto for AIX 3.2 and . */ #ifndef _NO_PROTO # define _NO_PROTO #endif #ifdef HAVE_CONFIG_H # include #endif #if !defined __STDC__ || !__STDC__ /* This is a separate conditional since some stdc systems reject `defined (const)'. */ # ifndef const # define const # endif #endif #include /* Comment out all this code if we are using the GNU C Library, and are not actually compiling the library itself. This code is part of the GNU C Library, but also included in many other GNU distributions. Compiling and linking in this code is a waste when using the GNU C library (especially if it is a shared library). Rather than having every GNU program understand `configure --with-gnu-libc' and omit the object files, it is simpler to just do this in the source for each such file. 
*/ #define GETOPT_INTERFACE_VERSION 2 #if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 # include # if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION # define ELIDE_CODE # endif #endif #ifndef ELIDE_CODE /* This needs to come after some library #include to get __GNU_LIBRARY__ defined. */ #ifdef __GNU_LIBRARY__ /* Don't include stdlib.h for non-GNU C libraries because some of them contain conflicting prototypes for getopt. */ # include # include #endif /* GNU C library. */ #ifdef VMS # include # if HAVE_STRING_H - 0 # include # endif #endif #ifndef _ /* This is for other GNU distributions with internationalized messages. */ # if defined HAVE_LIBINTL_H || defined _LIBC # include # ifndef _ # define _(msgid) gettext (msgid) # endif # else # define _(msgid) (msgid) # endif #endif /* This version of `getopt' appears to the caller like standard Unix `getopt' but it behaves differently for the user, since it allows the user to intersperse the options with the other arguments. As `getopt' works, it permutes the elements of ARGV so that, when it is done, all the options precede everything else. Thus all application programs are extended to handle flexible argument order. Setting the environment variable POSIXLY_CORRECT disables permutation. Then the behavior is completely standard. GNU application programs can use a third alternative mode in which they can distinguish the relative order of options and other arguments. */ #include "getopt.h" /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ /* 1003.2 says this must be 1 before any call. */ int optind = 1; /* Formerly, initialization of getopt depended on optind==0, which causes problems with re-calling getopt as programs generally don't know that. */ int __getopt_initialized; /* The next char to be scanned in the option-element in which the last option character we returned was found. This allows us to pick up the scan where we left off. If this is zero, or a null string, it means resume the scan by advancing to the next ARGV-element. */ static char *nextchar; /* Callers store zero here to inhibit the error message for unrecognized options. */ int opterr = 1; /* Set to an option character which was unrecognized. This must be initialized on some systems to avoid linking in the system's own getopt implementation. */ int optopt = '?'; /* Describe how to deal with options that follow non-option ARGV-elements. If the caller did not specify anything, the default is REQUIRE_ORDER if the environment variable POSIXLY_CORRECT is defined, PERMUTE otherwise. REQUIRE_ORDER means don't recognize them as options; stop option processing when the first non-option is seen. This is what Unix does. This mode of operation is selected by either setting the environment variable POSIXLY_CORRECT, or using `+' as the first character of the list of option characters. PERMUTE is the default. 
We permute the contents of ARGV as we scan, so that eventually all the non-options are at the end. This allows options to be given in any order, even with programs that were not written to expect this. RETURN_IN_ORDER is an option available to programs that were written to expect options and other ARGV-elements in any order and that care about the ordering of the two. We describe each non-option ARGV-element as if it were the argument of an option with character code 1. Using `-' as the first character of the list of option characters selects this mode of operation. The special argument `--' forces an end of option-scanning regardless of the value of `ordering'. In the case of RETURN_IN_ORDER, only `--' can cause `getopt' to return -1 with `optind' != ARGC. */ static enum { REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER } ordering; /* Value of POSIXLY_CORRECT environment variable. */ static char *posixly_correct; #ifdef __GNU_LIBRARY__ /* We want to avoid inclusion of string.h with non-GNU libraries because there are many ways it can cause trouble. On some systems, it contains special magic macros that don't work in GCC. */ # include # define my_index strchr #else # if HAVE_STRING_H # include # else # include # endif /* Avoid depending on library functions or files whose names are inconsistent. */ #ifndef getenv extern char *getenv(); #endif static char * my_index(str, chr) const char *str; int chr; { while (*str) { if (*str == chr) { return (char *) str; } str++; } return 0; } /* If using GCC, we can safely declare strlen this way. If not using GCC, it is ok not to declare it. */ #ifdef __GNUC__ /* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. That was relevant to code that was here before. */ # if (!defined __STDC__ || !__STDC__) && !defined strlen /* gcc with -traditional declares the built-in strlen to return int, and has done so at least since version 2.4.5. -- rms. */ extern int strlen(const char *); # endif /* not __STDC__ */ #endif /* __GNUC__ */ #endif /* not __GNU_LIBRARY__ */ /* Handle permutation of arguments. */ /* Describe the part of ARGV that contains non-options that have been skipped. `first_nonopt' is the index in ARGV of the first of them; `last_nonopt' is the index after the last of them. */ static int first_nonopt; static int last_nonopt; #ifdef _LIBC /* Stored original parameters. XXX This is no good solution. We should rather copy the args so that we can compare them later. But we must not use malloc(3). */ extern int __libc_argc; extern char **__libc_argv; /* Bash 2.0 gives us an environment variable containing flags indicating ARGV elements that should not be considered arguments. */ # ifdef USE_NONOPTION_FLAGS /* Defined in getopt_init.c */ extern char *__getopt_nonoption_flags; static int nonoption_flags_max_len; static int nonoption_flags_len; # endif # ifdef USE_NONOPTION_FLAGS # define SWAP_FLAGS(ch1, ch2) \ if (nonoption_flags_len > 0) \ { \ char __tmp = __getopt_nonoption_flags[ch1]; \ __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ __getopt_nonoption_flags[ch2] = __tmp; \ } # else # define SWAP_FLAGS(ch1, ch2) # endif #else /* !_LIBC */ # define SWAP_FLAGS(ch1, ch2) #endif /* _LIBC */ /* Exchange two adjacent subsequences of ARGV. One subsequence is elements [first_nonopt,last_nonopt) which contains all the non-options that have been skipped so far. The other is elements [last_nonopt,optind), which contains all the options processed since those non-options were skipped. 
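A small worked example, with made-up arguments: suppose ARGV is
   { "prog", "in.yuv", "out.bin", "-p", "8" }
   and the scan has reached first_nonopt = 1, last_nonopt = 3, optind = 5
   (so "-p" has taken "8" as its argument). The non-option block
   "in.yuv" "out.bin" is swapped with the option block "-p" "8", giving
   { "prog", "-p", "8", "in.yuv", "out.bin" },
   with first_nonopt becoming 3 and last_nonopt becoming 5.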
`first_nonopt' and `last_nonopt' are relocated so that they describe the new indices of the non-options in ARGV after they are moved. */ #if defined __STDC__ && __STDC__ static void exchange(char **); #endif static void exchange(argv) char **argv; { int bottom = first_nonopt; int middle = last_nonopt; int top = optind; char *tem; /* Exchange the shorter segment with the far end of the longer segment. That puts the shorter segment into the right place. It leaves the longer segment in the right place overall, but it consists of two parts that need to be swapped next. */ #if defined _LIBC && defined USE_NONOPTION_FLAGS /* First make sure the handling of the `__getopt_nonoption_flags' string can work normally. Our top argument must be in the range of the string. */ if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) { /* We must extend the array. The user plays games with us and presents new arguments. */ char *new_str = malloc(top + 1); if (new_str == NULL) { nonoption_flags_len = nonoption_flags_max_len = 0; } else { memset(__mempcpy(new_str, __getopt_nonoption_flags, nonoption_flags_max_len), '\0', top + 1 - nonoption_flags_max_len); nonoption_flags_max_len = top + 1; __getopt_nonoption_flags = new_str; } } #endif while (top > middle && middle > bottom) { if (top - middle > middle - bottom) { /* Bottom segment is the short one. */ int len = middle - bottom; register int i; /* Swap it with the top part of the top segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[top - (middle - bottom) + i]; argv[top - (middle - bottom) + i] = tem; SWAP_FLAGS(bottom + i, top - (middle - bottom) + i); } /* Exclude the moved bottom segment from further swapping. */ top -= len; } else { /* Top segment is the short one. */ int len = top - middle; register int i; /* Swap it with the bottom part of the bottom segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[middle + i]; argv[middle + i] = tem; SWAP_FLAGS(bottom + i, middle + i); } /* Exclude the moved top segment from further swapping. */ bottom += len; } } /* Update records for the slots the non-options now occupy. */ first_nonopt += (optind - last_nonopt); last_nonopt = optind; } /* Initialize the internal data when the first call is made. */ #if defined __STDC__ && __STDC__ static const char *_getopt_initialize(int, char *const *, const char *); #endif static const char * _getopt_initialize(argc, argv, optstring) int argc; char *const *argv; const char *optstring; { /* Start processing options with ARGV-element 1 (since ARGV-element 0 is the program name); the sequence of previously skipped non-option ARGV-elements is empty. */ first_nonopt = last_nonopt = optind; nextchar = NULL; posixly_correct = getenv("POSIXLY_CORRECT"); /* Determine how to handle the ordering of options and nonoptions. 
*/ if (optstring[0] == '-') { ordering = RETURN_IN_ORDER; ++optstring; } else if (optstring[0] == '+') { ordering = REQUIRE_ORDER; ++optstring; } else if (posixly_correct != NULL) { ordering = REQUIRE_ORDER; } else { ordering = PERMUTE; } #if defined _LIBC && defined USE_NONOPTION_FLAGS if (posixly_correct == NULL && argc == __libc_argc && argv == __libc_argv) { if (nonoption_flags_max_len == 0) { if (__getopt_nonoption_flags == NULL || __getopt_nonoption_flags[0] == '\0') { nonoption_flags_max_len = -1; } else { const char *orig_str = __getopt_nonoption_flags; int len = nonoption_flags_max_len = strlen(orig_str); if (nonoption_flags_max_len < argc) { nonoption_flags_max_len = argc; } __getopt_nonoption_flags = (char *) malloc(nonoption_flags_max_len); if (__getopt_nonoption_flags == NULL) { nonoption_flags_max_len = -1; } else memset(__mempcpy(__getopt_nonoption_flags, orig_str, len), '\0', nonoption_flags_max_len - len); } } nonoption_flags_len = nonoption_flags_max_len; } else { nonoption_flags_len = 0; } #endif return optstring; } /* Scan elements of ARGV (whose length is ARGC) for option characters given in OPTSTRING. If an element of ARGV starts with '-', and is not exactly "-" or "--", then it is an option element. The characters of this element (aside from the initial '-') are option characters. If `getopt' is called repeatedly, it returns successively each of the option characters from each of the option elements. If `getopt' finds another option character, it returns that character, updating `optind' and `nextchar' so that the next call to `getopt' can resume the scan with the following option character or ARGV-element. If there are no more option characters, `getopt' returns -1. Then `optind' is the index in ARGV of the first ARGV-element that is not an option. (The ARGV-elements have been permuted so that those that are not options now come last.) OPTSTRING is a string containing the legitimate option characters. If an option character is seen that is not listed in OPTSTRING, return '?' after printing an error message. If you set `opterr' to zero, the error message is suppressed but we still return '?'. If a char in OPTSTRING is followed by a colon, that means it wants an arg, so the following text in the same ARGV-element, or the text of the following ARGV-element, is returned in `optarg'. Two colons mean an option that wants an optional arg; if there is text in the current ARGV-element, it is returned in `optarg', otherwise `optarg' is set to zero. If OPTSTRING starts with `-' or `+', it requests different methods of handling the non-option ARGV-elements. See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. Long-named options begin with `--' instead of `-'. Their names may be abbreviated as long as the abbreviation is unique or is an exact match for some defined option. If they have an argument, it follows the option name in the same ARGV-element, separated from the option name by a `=', or else the in next ARGV-element. When `getopt' finds a long-named option, it returns 0 if that option's `flag' field is nonzero, the value of the option's `val' field if the `flag' field is zero. The elements of ARGV aren't really const, because we permute them. But we pretend they're const in the prototype to be compatible with other systems. LONGOPTS is a vector of `struct option' terminated by an element containing a name which is zero. LONGIND returns the index in LONGOPT of the long-named option found. 
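As a concrete sketch of this calling convention (it mirrors the self-test
   under TEST at the bottom of this file; the option letters and handler names
   are only illustrative), a caller loops until -1 is returned:

       int c;
       while ((c = getopt(argc, argv, "ab:")) != -1) {
           switch (c) {
           case 'a': handle_a();        break;
           case 'b': handle_b(optarg);  break;
           case '?': usage();           break;
           }
       }
       // argv[optind] .. argv[argc-1] are now the (permuted) non-options.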
It is only valid when a long-named option has been found by the most recent call. If LONG_ONLY is nonzero, '-' as well as '--' can introduce long-named options. */ int _getopt_internal(argc, argv, optstring, longopts, longind, long_only) int argc; char *const *argv; const char *optstring; const struct option *longopts; int32_t *longind; int long_only; { int print_errors = opterr; if (optstring[0] == ':') { print_errors = 0; } if (argc < 1) { return -1; } optarg = NULL; if (optind == 0 || !__getopt_initialized) { if (optind == 0) { optind = 1; /* Don't scan ARGV[0], the program name. */ } optstring = _getopt_initialize(argc, argv, optstring); __getopt_initialized = 1; } /* Test whether ARGV[optind] points to a non-option argument. Either it does not have option syntax, or there is an environment flag from the shell indicating it is not an option. The later information is only used when the used in the GNU libc. */ #if defined _LIBC && defined USE_NONOPTION_FLAGS # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ || (optind < nonoption_flags_len \ && __getopt_nonoption_flags[optind] == '1')) #else # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') #endif if (nextchar == NULL || *nextchar == '\0') { /* Advance to the next ARGV-element. */ /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been moved back by the user (who may also have changed the arguments). */ if (last_nonopt > optind) { last_nonopt = optind; } if (first_nonopt > optind) { first_nonopt = optind; } if (ordering == PERMUTE) { /* If we have just processed some options following some non-options, exchange them so that the options come first. */ if (first_nonopt != last_nonopt && last_nonopt != optind) { exchange((char **) argv); } else if (last_nonopt != optind) { first_nonopt = optind; } /* Skip any additional non-options and extend the range of non-options previously skipped. */ while (optind < argc && NONOPTION_P) { optind++; } last_nonopt = optind; } /* The special ARGV-element `--' means premature end of options. Skip it like a null option, then exchange with previous non-options as if it were an option, then skip everything else like a non-option. */ if (optind != argc && !strcmp(argv[optind], "--")) { optind++; if (first_nonopt != last_nonopt && last_nonopt != optind) { exchange((char **) argv); } else if (first_nonopt == last_nonopt) { first_nonopt = optind; } last_nonopt = argc; optind = argc; } /* If we have done all the ARGV-elements, stop the scan and back over any non-options that we skipped and permuted. */ if (optind == argc) { /* Set the next-arg-index to point at the non-options that we previously skipped, so the caller will digest them. */ if (first_nonopt != last_nonopt) { optind = first_nonopt; } return -1; } /* If we have come to a non-option and did not permute it, either stop the scan or describe it to the caller and pass it by. */ if (NONOPTION_P) { if (ordering == REQUIRE_ORDER) { return -1; } optarg = argv[optind++]; return 1; } /* We have found another option-ARGV-element. Skip the initial punctuation. */ nextchar = (argv[optind] + 1 + (longopts != NULL && argv[optind][1] == '-')); } /* Decode the current option-ARGV-element. */ /* Check whether the ARGV-element is a long option. If long_only and the ARGV-element has the form "-f", where f is a valid short option, don't consider it an abbreviated form of a long option that starts with f. Otherwise there would be no way to give the -f short option. 
On the other hand, if there's a long option "fubar" and the ARGV-element is "-fu", do consider that an abbreviation of the long option, just like "--fu", and not "-f" with arg "u". This distinction seems to be the most useful approach. */ if (longopts != NULL && (argv[optind][1] == '-' || (long_only && (argv[optind][2] || !my_index(optstring, argv[optind][1]))))) { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = -1; int option_index; for (nameend = nextchar; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. */ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp(p->name, nextchar, nameend - nextchar)) { if ((unsigned int)(nameend - nextchar) == (unsigned int) strlen(p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else if (long_only || pfound->has_arg != p->has_arg || pfound->flag != p->flag || pfound->val != p->val) /* Second or later nonexact match found. */ { ambig = 1; } } if (ambig && !exact) { if (print_errors) fprintf(stderr, _("%s: option `%s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen(nextchar); optind++; optopt = 0; return '?'; } if (pfound != NULL) { option_index = indfound; optind++; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) { optarg = nameend + 1; } else { if (print_errors) { if (argv[optind - 1][1] == '-') /* --option */ fprintf(stderr, _("%s: option `--%s' doesn't allow an argument\n"), argv[0], pfound->name); else /* +option or -option */ fprintf(stderr, _("%s: option `%c%s' doesn't allow an argument\n"), argv[0], argv[optind - 1][0], pfound->name); } nextchar += strlen(nextchar); optopt = pfound->val; return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) { optarg = argv[optind++]; } else { if (print_errors) fprintf(stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen(nextchar); optopt = pfound->val; return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen(nextchar); if (longind != NULL) { *longind = option_index; } if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } /* Can't find it as a long option. If this is not getopt_long_only, or the option starts with '--' or is not a valid short option, then it's an error. Otherwise interpret it as a short option. */ if (!long_only || argv[optind][1] == '-' || my_index(optstring, *nextchar) == NULL) { if (print_errors) { if (argv[optind][1] == '-') /* --option */ fprintf(stderr, _("%s: unrecognized option `--%s'\n"), argv[0], nextchar); else /* +option or -option */ fprintf(stderr, _("%s: unrecognized option `%c%s'\n"), argv[0], argv[optind][0], nextchar); } nextchar = (char *) ""; optind++; optopt = 0; return '?'; } } /* Look at and handle the next short option-character. */ { char c = *nextchar++; char *temp = my_index(optstring, c); /* Increment `optind' when we start to process its last character. */ if (*nextchar == '\0') { ++optind; } if (temp == NULL || c == ':') { if (print_errors) { if (posixly_correct) /* 1003.2 specifies the format of this message. 
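(These diagnostics can be silenced by the caller: setting opterr to 0, or
   starting the option string with ':', turns them off. With a leading ':' the
   two failure cases remain distinguishable without any message being printed,
   e.g. for optstring ":ab:" an unknown "-z" makes getopt return '?' with
   optopt set to 'z', while "-b" given as the last argument returns ':'
   instead of '?'.)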
*/ fprintf(stderr, _("%s: illegal option -- %c\n"), argv[0], c); else fprintf(stderr, _("%s: invalid option -- %c\n"), argv[0], c); } optopt = c; return '?'; } /* Convenience. Treat POSIX -W foo same as long option --foo */ if (temp[0] == 'W' && temp[1] == ';') { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = 0; int option_index; /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf(stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') { c = ':'; } else { c = '?'; } return c; } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. */ { optarg = argv[optind++]; } /* optarg is now the argument, see if it's in the table of longopts. */ for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. */ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp(p->name, nextchar, nameend - nextchar)) { if ((unsigned int)(nameend - nextchar) == strlen(p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else /* Second or later nonexact match found. */ { ambig = 1; } } if (ambig && !exact) { if (print_errors) fprintf(stderr, _("%s: option `-W %s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen(nextchar); optind++; return '?'; } if (pfound != NULL) { option_index = indfound; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) { optarg = nameend + 1; } else { if (print_errors) fprintf(stderr, _("\ %s: option `-W %s' doesn't allow an argument\n"), argv[0], pfound->name); nextchar += strlen(nextchar); return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) { optarg = argv[optind++]; } else { if (print_errors) fprintf(stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen(nextchar); return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen(nextchar); if (longind != NULL) { *longind = option_index; } if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } nextchar = NULL; return 'W'; /* Let the application handle it. */ } if (temp[1] == ':') { if (temp[2] == ':') { /* This is an option that accepts an argument optionally. */ if (*nextchar != '\0') { optarg = nextchar; optind++; } else { optarg = NULL; } nextchar = NULL; } else { /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf(stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') { c = ':'; } else { c = '?'; } } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. 
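For example, with optstring "b:" both "-b file" and "-bfile" end up with
   optarg pointing at "file": either the remainder of the current ARGV-element
   is used, or (as on this path) the following element is consumed. By
   contrast, an optional-argument option such as "c::" only picks up an
   argument in the attached form "-cfile"; "-c file" leaves optarg NULL and
   "file" is treated as a non-option.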
*/ { optarg = argv[optind++]; } nextchar = NULL; } } return c; } } int getopt(argc, argv, optstring) int argc; char *const *argv; const char *optstring; { return _getopt_internal(argc, argv, optstring, (const struct option *) 0, (int32_t *) 0, 0); } int getopt_long(argc, argv, options, long_options, opt_index) int argc; char *const *argv; const char *options; const struct option *long_options; int32_t *opt_index; { return _getopt_internal(argc, argv, options, long_options, opt_index, 0); } #endif /* Not ELIDE_CODE. */ #ifdef TEST /* Compile with -DTEST to make an executable for use in testing the above definition of `getopt'. */ int main(argc, argv) int argc; char **argv; { int c; int digit_optind = 0; while (1) { int this_option_optind = optind ? optind : 1; c = getopt(argc, argv, "abc:d:0123456789"); if (c == -1) { break; } switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (digit_optind != 0 && digit_optind != this_option_optind) { printf("digits occur in two different argv-elements.\n"); } digit_optind = this_option_optind; printf("option %c\n", c); break; case 'a': printf("option a\n"); break; case 'b': printf("option b\n"); break; case 'c': printf("option c with value `%s'\n", optarg); break; case '?': break; default: printf("?? getopt returned character code 0%o ??\n", c); } } if (optind < argc) { printf("non-option ARGV-elements: "); while (optind < argc) { printf("%s ", argv[optind++]); } printf("\n"); } exit(0); } #endif /* TEST */ xavs2-1.3/source/compat/getopt/getopt.h000066400000000000000000000152041340660520300201610ustar00rootroot00000000000000/* Declarations for getopt. Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ #ifndef _GETOPT_H #ifndef __need_getopt # define _GETOPT_H 1 #endif #include /* If __GNU_LIBRARY__ is not already defined, either we are being used standalone, or this is the first header included in the source file. If we are being used with glibc, we need to include , but that does not exist if we are standalone. So: if __GNU_LIBRARY__ is not defined, include , which will pull in for us if it's from glibc. (Why ctype.h? It's guaranteed to exist and it doesn't flood the namespace with stuff the way some other headers do.) */ #if !defined __GNU_LIBRARY__ # include #endif #ifdef __cplusplus extern "C" { #endif /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ extern char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. 
On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ extern int optind; /* Callers store zero here to inhibit the error message `getopt' prints for unrecognized options. */ extern int opterr; /* Set to an option character which was unrecognized. */ extern int optopt; #ifndef __need_getopt /* Describe the long-named options requested by the application. The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector of `struct option' terminated by an element containing a name which is zero. The field `has_arg' is: no_argument (or 0) if the option does not take an argument, required_argument (or 1) if the option requires an argument, optional_argument (or 2) if the option takes an optional argument. If the field `flag' is not NULL, it points to a variable that is set to the value given in the field `val' when the option is found, but left unchanged if the option is not found. To have a long-named option do something other than set an `int' to a compiled-in constant, such as set a value from `optarg', set the option's `flag' field to zero and its `val' field to a nonzero value (the equivalent single-letter option character, if there is one). For long options that have a zero `flag' field, `getopt' returns the contents of the `val' field. */ struct option { # if (defined __STDC__ && __STDC__) || defined __cplusplus const char *name; # else char *name; # endif /* has_arg can't be an enum because some compilers complain about type mismatches in all the code that assumes it is an int. */ int has_arg; int32_t *flag; int val; }; /* Names for the values of the `has_arg' field of `struct option'. */ # define no_argument 0 # define required_argument 1 # define optional_argument 2 #endif /* need getopt */ /* Get definitions and prototypes for functions to process the arguments in ARGV (ARGC of them, minus the program name) for options given in OPTS. Return the option character from OPTS just read. Return -1 when there are no more options. For unrecognized options, or options missing arguments, `optopt' is set to the option letter, and '?' is returned. The OPTS string is a list of characters which are recognized option letters, optionally followed by colons, specifying that that letter takes an argument, to be placed in `optarg'. If a letter in OPTS is followed by two colons, its argument is optional. This behavior is specific to the GNU `getopt'. The argument `--' causes premature termination of argument scanning, explicitly telling `getopt' that there are no more options. If OPTS begins with `--', then non-option arguments are treated as arguments to the option '\0'. This behavior is specific to the GNU `getopt'. */ #if (defined __STDC__ && __STDC__) || defined __cplusplus # ifdef __GNU_LIBRARY__ /* Many other libraries have conflicting prototypes for getopt, with differences in the consts, in stdlib.h. To avoid compilation errors, only prototype getopt for the GNU C library. 
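A minimal illustration of the long-option interface declared below (the
   variable and option names are made up; note that this header types the
   `flag' field and the LONGIND argument as int32_t *):

       static int32_t verbose_flag;
       static const struct option longopts[] = {
           { "verbose", no_argument,       &verbose_flag, 1   },
           { "output",  required_argument, 0,             'o' },
           { 0, 0, 0, 0 }
       };
       int c;
       int32_t idx;
       while ((c = getopt_long(argc, argv, "o:", longopts, &idx)) != -1) {
           if (c == 'o')
               output_name = optarg;   // hypothetical consumer of the argument
       }

   "--output=out.bin", "--output out.bin" and the unique abbreviation
   "--out out.bin" all deliver "out.bin" in optarg; "--verbose" stores 1 into
   verbose_flag and makes getopt_long return 0.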
*/ extern int getopt(int __argc, char *const *__argv, const char *__shortopts); # else /* not __GNU_LIBRARY__ */ extern int getopt(); # endif /* __GNU_LIBRARY__ */ # ifndef __need_getopt extern int getopt_long(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind); extern int getopt_long_only(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind); /* Internal only. Users should not call this directly. */ extern int _getopt_internal(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind, int __long_only); # endif #else /* not __STDC__ */ extern int getopt(); # ifndef __need_getopt extern int getopt_long(); extern int getopt_long_only(); extern int _getopt_internal(); # endif #endif /* __STDC__ */ #ifdef __cplusplus } #endif /* Make sure we later can get all the definitions and declarations. */ #undef __need_getopt #endif /* getopt.h */ xavs2-1.3/source/compat/msvc/000077500000000000000000000000001340660520300161525ustar00rootroot00000000000000xavs2-1.3/source/compat/msvc/stdint.h000066400000000000000000000010401340660520300176230ustar00rootroot00000000000000#pragma once #ifndef _MSC_VER #error "Use this header only with Microsoft Visual C++ compilers!" #endif #include // for intptr_t #if !defined(UINT64_MAX) #include #define UINT64_MAX _UI64_MAX #endif /* a minimal set of C99 types for use with MSVC (VC9) */ typedef signed char int8_t; typedef short int int16_t; typedef int int32_t; typedef __int64 int64_t; typedef unsigned char uint8_t; typedef unsigned short int uint16_t; typedef unsigned int uint32_t; typedef unsigned __int64 uint64_t; xavs2-1.3/source/configw.h000066400000000000000000000056221340660520300155310ustar00rootroot00000000000000/* * configw.h * * Description of this file: * compiling configuration for windows platform * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_CONFIGW_H #define XAVS2_CONFIGW_H #if defined(__ICL) || defined(_MSC_VER) /* arch */ #define ARCH_X86 1 #define ARCH_PPC 0 #define ARCH_ARM 0 #define ARCH_UltraSPARC 0 /* system */ #define SYS_WINDOWS 1 #define SYS_LINUX 0 #define SYS_MACOSX 0 #define SYS_BEOS 0 #define SYS_FREEBSD 0 #define SYS_OPENBSD 0 /* cpu */ #ifndef __SSE__ #define __SSE__ #endif #define HAVE_MMX 1 /* X86 */ #define HAVE_ALTIVEC 0 /* ALTIVEC */ #define HAVE_ALTIVEC_H 0 #define HAVE_ARMV6 0 #define HAVE_ARMV6T2 0 /* thread */ #define HAVE_THREAD 1 #define HAVE_WIN32THREAD 1 #define HAVE_PTHREAD 0 #define HAVE_BEOSTHREAD 0 #define HAVE_POSIXTHREAD 0 #define PTW32_STATIC_LIB 0 /* interlace support */ #define HAVE_INTERLACED 1 /* malloc */ #define HAVE_MALLOC_H 0 /* big-endian */ #define WORDS_BIGENDIAN 0 /* others */ #define HAVE_STDINT_H 1 #define HAVE_VECTOREXT 0 #define HAVE_LOG2F 0 #define HAVE_SWSCALE 0 #define HAVE_LAVF 0 #define HAVE_FFMS 0 #define HAVE_GPAC 0 #define HAVE_GF_MALLOC 0 #define HAVE_AVS 0 #endif #endif // XAVS2_CONFIGW_H xavs2-1.3/source/encoder/000077500000000000000000000000001340660520300153365ustar00rootroot00000000000000xavs2-1.3/source/encoder/aec.c000066400000000000000000002403111340660520300162330ustar00rootroot00000000000000/* * aec.c * * Description of this file: * AEC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "cudata.h" #include "aec.h" #include "bitstream.h" #include "block_info.h" #include "cudata.h" /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ static const int tab_b8xy_to_zigzag[8][8] = { { 0, 1, 4, 5, 16, 17, 20, 21 }, { 2, 3, 6, 7, 18, 19, 22, 23 }, { 8, 9, 12, 13, 24, 25, 28, 29 }, { 10, 11, 14, 15, 26, 27, 30, 31 }, { 32, 33, 36, 37, 48, 49, 52, 53 }, { 34, 35, 38, 39, 50, 51, 54, 55 }, { 40, 41, 44, 45, 56, 57, 60, 61 }, { 42, 43, 46, 47, 58, 59, 62, 63 } }; /* --------------------------------------------------------------------------- * ļone bitʣλ */ static INLINE void bitstt_put_one_bit_and_remainder(aec_t *p_aec, const int b) { uint32_t N = 1 + p_aec->i_bits_to_follow; // ܹı if (N > p_aec->num_left_flush_bits) { /* ıǰֽʣı */ int header_bits = p_aec->num_left_flush_bits; // ǰһֽʣλ uint32_t header_byte = (1 << (header_bits - 1)) - (!b); // ʣλֵ int num_left_bytes = (N - header_bits) >> 3; // ǰֽ⣬ʣӦֽ int num_left_bits = N - header_bits - (num_left_bytes << 3); // ı p_aec->reg_flush_bits |= header_byte; bitstr_flush_bits(p_aec); p_aec->num_left_flush_bits = NUM_FLUSH_BITS - num_left_bits; if (b == 0) { /* b Ϊʱмbitsȫ 1 */ while (num_left_bytes != 0) { *(p_aec->p) = 0xff; p_aec->p++; num_left_bytes--; } /* num_left_bits λ reg_flush_bits λ */ p_aec->reg_flush_bits = 0xffu >> (8 - num_left_bits) << p_aec->num_left_flush_bits; } else { p_aec->p += num_left_bytes; } } else { /* ǰҪbitСдֽʣbit */ uint32_t bits = (1 << p_aec->i_bits_to_follow) - (!b); // ıɵĶֵ p_aec->reg_flush_bits |= bits << (p_aec->num_left_flush_bits - N); p_aec->num_left_flush_bits -= N; if (p_aec->num_left_flush_bits == 0) { bitstr_flush_bits(p_aec); p_aec->reg_flush_bits = 0; p_aec->num_left_flush_bits = NUM_FLUSH_BITS; } } p_aec->i_bits_to_follow = 0; } /** * =========================================================================== * binary * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_shift(uint32_t v) { #if SYS_WINDOWS && !ARCH_X86_64 __asm { bsr eax, v mov v, eax } return 8 - v; #else int i; for (i = 0; !(v & 0x100); i++) { v <<= 1; } return i; #endif } /* --------------------------------------------------------------------------- * logarithm arithmetic coder */ static INLINE void biari_encode_symbol_aec(aec_t *p_aec, uint8_t symbol, context_t *p_ctx) { uint32_t lg_pmps = p_ctx->LG_PMPS; #if !CTRL_OPT_AEC uint32_t cycno = p_ctx->cycno; uint32_t lg_tmp; #endif uint32_t t = p_aec->i_t1; uint32_t low = p_aec->i_low; const uint32_t lg_pmps_shifted = lg_pmps >> LG_PMPS_SHIFTNO; int s = (t < lg_pmps_shifted); if (symbol == p_ctx->MPS) { // MPS happens if (s) { if (low & (1 << 9)) { bitstt_put_one_bit_and_remainder(p_aec, 1); } else if (low & (1 << 8)) { p_aec->i_bits_to_follow++; low &= ((1 << 8) ^ 0xFFFFFFFF); } else { bitstt_put_one_bit_and_remainder(p_aec, 0); } low <<= 1; } t = (t - lg_pmps_shifted) & 0xFF; #if CTRL_OPT_AEC p_ctx->v = g_tab_ctx_mps[p_ctx->v].v; #else lg_tmp = lg_pmps >> XAVS2_MAX(2 + cycno, 3); lg_pmps -= (lg_tmp + (lg_tmp >> LG_PMPS_SHIFTNO)); cycno += (!cycno); #endif } else { // LPS int shift; uint32_t low_buf = (low << s) + 256 + ((t - lg_pmps_shifted) & 0xFF); uint32_t bitstogo = 9 + s; uint32_t bit_oa; t = ((-s) & t) + lg_pmps_shifted; shift = aec_get_shift(t); t <<= 
shift; s += shift; // left shift s2 bits bit_oa = ((low_buf >> bitstogo) & 1); while (s-- != 0) { uint32_t bit_o = bit_oa; bitstogo--; bit_oa = ((low_buf >> bitstogo) & 1); if (bit_o) { bitstt_put_one_bit_and_remainder(p_aec, 1); } else if (bit_oa) { // 01 p_aec->i_bits_to_follow++; bit_oa = 0; } else { // 00 bitstt_put_one_bit_and_remainder(p_aec, 0); } } t &= 0xff; low = (low_buf << shift) & ((bit_oa << 9) | 0x1ff); #if CTRL_OPT_AEC p_ctx->v = g_tab_ctx_lps[p_ctx->v].v; #else lg_pmps += tab_cwr[cycno]; cycno += (cycno < 3); if (lg_pmps >= (256 << LG_PMPS_SHIFTNO)) { lg_pmps = ((512 << LG_PMPS_SHIFTNO) - 1) - lg_pmps; p_ctx->MPS = (uint8_t)(!p_ctx->MPS); } #endif } p_aec->i_t1 = t; p_aec->i_low = low; #if !CTRL_OPT_AEC p_ctx->LG_PMPS = (uint16_t)lg_pmps; p_ctx->cycno = (uint8_t) cycno; #endif } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_tu_aec(aec_t *p_aec, int num_zeros, int max_len, context_t *p_ctx) { max_len -= num_zeros; while (num_zeros != 0) { biari_encode_symbol_aec(p_aec, 0, p_ctx); num_zeros--; } if (max_len) { biari_encode_symbol_aec(p_aec, 1, p_ctx); } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_eq_prob_aec(aec_t *p_aec, uint8_t symbol) { uint32_t low_buf = (p_aec->i_low << 1) + (symbol ? (p_aec->i_t1 + 256) : 0); uint8_t bit_oa = (uint8_t)((low_buf >> 9) & 1); // out bit if ((low_buf >> 10) & 1) { bitstt_put_one_bit_and_remainder(p_aec, 1); } else { if (bit_oa) { // 01 p_aec->i_bits_to_follow++; bit_oa = 0; } else { // 00 bitstt_put_one_bit_and_remainder(p_aec, 0); } } p_aec->i_low = low_buf & ((bit_oa << 9) | 0x1ff); } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbols_eq_prob_aec(aec_t *p_aec, uint32_t val, int len) { while (--len >= 0) { biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)((val >> len) & 1)); } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_final_aec(aec_t *p_aec, uint8_t symbol) { uint32_t t = p_aec->i_t1; uint32_t low = p_aec->i_low; if (symbol) { int s = !t; uint32_t low_buf = (low << s) + 256 + ((t - 1) & 0xFF); uint32_t bitstogo = 9 + s; uint8_t bit_oa = (uint8_t)((low_buf >> bitstogo) & 1); uint8_t bit_o; s += 8; while (s-- > 0) { bit_o = bit_oa; bitstogo--; bit_oa = (uint8_t)((low_buf >> bitstogo) & 1); if (bit_o) { bitstt_put_one_bit_and_remainder(p_aec, 1); } else { if (bit_oa) { // 01 p_aec->i_bits_to_follow++; bit_oa = 0; } else { // 00 bitstt_put_one_bit_and_remainder(p_aec, 0); } } } p_aec->i_low = (low_buf << 8) & ((bit_oa << 9) | 0x1ff); p_aec->i_t1 = 0; } else { // MPS if (!t) { if (low & (1 << 9)) { bitstt_put_one_bit_and_remainder(p_aec, 1); } else { if (low & (1 << 8)) { p_aec->i_bits_to_follow++; low &= ((1 << 8) ^ 0xFFFFFFFF); } else { bitstt_put_one_bit_and_remainder(p_aec, 0); } } p_aec->i_low = low << 1; } p_aec->i_t1 = (t - 1) & 0xff; } } /** * =========================================================================== * arithmetic coding * =========================================================================== */ /** * =========================================================================== * syntax coding * =========================================================================== */ /* --------------------------------------------------------------------------- * cu type for B/F/P frame */ static INLINE int aec_write_cutype(aec_t 
*p_aec, int i_cu_type, int i_cu_level, int i_cu_cbp, int is_amp_enabled) { context_t *p_ctx = p_aec->p_ctx_set->cu_type_contexts; int org_bits = arienco_bits_written(p_aec); int act_sym = MAP_CU_TYPE[i_cu_type]; if (i_cu_type == PRED_SKIP && i_cu_cbp == 0) { act_sym = 0; } switch (act_sym) { case 0: // SKIP biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); break; case 1: // DIRECT biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); break; case 2: // 2Nx2N biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 1, p_ctx + 2); break; case 3: // 2NxN, 2NxnU, 2NxnD biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_aec(p_aec, 1, p_ctx + 3); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_2NxN) { biari_encode_symbol_aec(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_aec(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_aec(p_aec, (uint8_t)(i_cu_type == PRED_2NxnU), p_ctx + 1); // AMP shape } } break; case 4: // Nx2N, nLx2N, nRx2N biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); biari_encode_symbol_aec(p_aec, 1, p_ctx + 4); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_Nx2N) { biari_encode_symbol_aec(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_aec(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_aec(p_aec, (uint8_t)(i_cu_type == PRED_nLx2N), p_ctx + 1); // AMP shape } } break; //case 5: // NxN, not enabled // biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); // biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); // biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); // biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); // biari_encode_symbol_aec(p_aec, 0, p_ctx + 4); // if (i_cu_level > B8X8_IN_BIT) { // biari_encode_symbol_final_aec(p_aec, 1); // } // break; default: // case 6: // Intra biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); biari_encode_symbol_aec(p_aec, 0, p_ctx + 4); if (i_cu_level > B8X8_IN_BIT) { biari_encode_symbol_final_aec(p_aec, 0); } break; } #if XAVS2_TRACE if (p_aec->b_writting) { write_trace_info2("cuType", i_cu_type, 1); } #endif /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode a pair of intra prediction modes of a given cu */ static int aec_write_intra_pred_mode(aec_t *p_aec, int ipmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_luma_pred_mode; int org_bits = arienco_bits_written(p_aec); if (ipmode >= 0) { biari_encode_symbol_aec(p_aec, 0, p_ctx ); biari_encode_symbol_aec(p_aec, (uint8_t)((ipmode & 0x10) >> 4), p_ctx + 1); biari_encode_symbol_aec(p_aec, (uint8_t)((ipmode & 0x08) >> 3), p_ctx + 2); biari_encode_symbol_aec(p_aec, (uint8_t)((ipmode & 0x04) >> 2), p_ctx + 3); biari_encode_symbol_aec(p_aec, (uint8_t)((ipmode & 0x02) >> 1), p_ctx + 4); biari_encode_symbol_aec(p_aec, (uint8_t)((ipmode & 0x01) ), p_ctx + 5); } else { 
biari_encode_symbol_aec(p_aec, 1, p_ctx ); biari_encode_symbol_aec(p_aec, (uint8_t)(ipmode + 2), p_ctx + 6); } #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("@%d Ipred Mode\t\t\t%d\n", g_sym_count++, ipmode); } #endif /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the reference parameter of a given cu */ static INLINE int aec_write_ref(xavs2_t *h, aec_t *p_aec, int ref_idx) { context_t *p_ctx = p_aec->p_ctx_set->pu_reference_index; int org_bits = arienco_bits_written(p_aec); int act_sym = ref_idx; /* 0λ0ģ1λ1ģ2 */ if (act_sym == 0) { biari_encode_symbol_aec(p_aec, 1, p_ctx); } else { int act_ctx = 0; biari_encode_symbol_aec(p_aec, 0, p_ctx++); while (--act_sym != 0) { biari_encode_symbol_aec(p_aec, 0, p_ctx); if (!act_ctx) { p_ctx++; } } if (ref_idx < h->i_ref - 1) { biari_encode_symbol_aec(p_aec, 1, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the motion vector data */ static INLINE int aec_write_mvd(aec_t *p_aec, int mvd, int xy) { context_t *p_ctx = p_aec->p_ctx_set->mvd_contexts[xy]; int org_bits = arienco_bits_written(p_aec); uint32_t act_sym = XAVS2_ABS(mvd); if (act_sym < 3) { // 0, 1, 2 if (act_sym == 0) { biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); } else if (act_sym == 1) { biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); } else { // act_sym == 2 biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); } } else { int exp_golomb_order = 0; biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 1, p_ctx + 2); if ((act_sym & 1) == 1) { // odds >3 biari_encode_symbol_eq_prob_aec(p_aec, 0); act_sym = (act_sym - 3) >> 1; } else { // even >3 biari_encode_symbol_eq_prob_aec(p_aec, 1); act_sym = (act_sym - 4) >> 1; } /* exp_golomb part */ while (act_sym >= (uint32_t)(1 << exp_golomb_order)) { act_sym -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_aec(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_aec(p_aec, act_sym, exp_golomb_order); // Exp-Golomb: suffix } if (mvd != 0) { // mv sign biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)(mvd < 0)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dmh_mode(aec_t *p_aec, int i_cu_level, int dmh_mode) { static const int iEncMapTab[9] = { 0, 5, 6, 1, 2, 7, 8, 3, 4 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index + 3; int org_bits = arienco_bits_written(p_aec); int symbol = dmh_mode != 0; p_ctx += (i_cu_level - MIN_CU_SIZE_IN_BIT) * 3; biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx); if (symbol) { int iMapVal = iEncMapTab[dmh_mode]; if (iMapVal < 3) { symbol = (iMapVal != 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)symbol); } else if (iMapVal < 5) { symbol = (iMapVal != 3); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)symbol); } else { biari_encode_symbol_aec(p_aec, 1, 
p_ctx + 1);
            biari_encode_symbol_aec(p_aec, 1, p_ctx + 2);
            symbol = (iMapVal >= 7);
            biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)symbol);
            symbol = !(iMapVal & 1);
            biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)symbol);
        }
    }

    /* return the number of written bits */
    return arienco_bits_written(p_aec) - org_bits;
}

/* ---------------------------------------------------------------------------
 * write "transform_split_flag" and SDIP type for intra CU
 */
static INLINE int aec_write_intra_cutype(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_tu_split, int is_sdip_enabled)
{
    context_t *p_ctx = p_aec->p_ctx_set->transform_split_flag;
    int org_bits = arienco_bits_written(p_aec);
    uint8_t transform_split_flag = i_tu_split != TU_SPLIT_NON;

    /* just write split or not */
    if (i_cu_level == B8X8_IN_BIT) {
        biari_encode_symbol_aec(p_aec, transform_split_flag, p_ctx + 1);
    } else if (is_sdip_enabled && (i_cu_level == B32X32_IN_BIT || i_cu_level == B16X16_IN_BIT)) {
        biari_encode_symbol_aec(p_aec, transform_split_flag, p_ctx + 2);
    }
#if XAVS2_TRACE
    if (p_aec->b_writting) {
        xavs2_trace("Transform_Size = %3d \n", transform_split_flag);
    }
#endif
    if (is_sdip_enabled) {
        if ((i_cu_level == B32X32_IN_BIT || i_cu_level == B16X16_IN_BIT) && i_tu_split) {
            p_ctx = p_aec->p_ctx_set->intra_pu_type_contexts;
            biari_encode_symbol_aec(p_aec, i_cu_type == PRED_I_2Nxn, p_ctx);
#if XAVS2_TRACE
            if (i_cu_type != PRED_I_2Nxn && i_cu_type != PRED_I_nx2N) {
                xavs2_log(NULL, XAVS2_LOG_ERROR, "!!!error cu_type!!!\n");
            }
        } else {
            i_cu_type = PRED_I_2Nx2N;
#endif
        }
#if XAVS2_TRACE
        if (p_aec->b_writting) {
            write_trace_info2("cuType", i_cu_type, 1);
        }
#endif
    }

    /* return the number of written bits */
    return arienco_bits_written(p_aec) - org_bits;
}

/* --------------------------------------------------------------------------- */
static INLINE int aec_write_pdir(aec_t *p_aec, int i_cu_type, int i_cu_level, int pdir0, int pdir1)
{
    int new_pdir[4] = { 2, 1, 3, 0 };
    context_t *p_ctx = p_aec->p_ctx_set->pu_type_index;
    int org_bits = arienco_bits_written(p_aec);
    int act_ctx = 0;
    int act_sym;
    int symbol;

    if (i_cu_type == PRED_2Nx2N) {
        /* a 2Nx2N CU has a single PU; its prediction direction is coded
         * with 3 contexts: 0, 1, 2 */
        act_sym = pdir0;
        while (act_sym >= 1) {
            biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx);
            act_sym--;
            act_ctx++;
        }
        if (pdir0 != 3) {
            biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx);
        }
    } else if ((i_cu_type >= PRED_2NxN && i_cu_type <= PRED_nRx2N) && i_cu_level == B8X8_IN_BIT) {
        /* an 8x8 CU split into two PUs (8x4 or 4x8): each PU can only be
         * uni-directionally predicted, 4 combinations in total, so only two
         * bins are written, using the b_pu_type_min_index contexts */
        p_ctx = p_aec->p_ctx_set->b_pu_type_min_index;
        pdir0 = new_pdir[pdir0];
        pdir1 = new_pdir[pdir1];
        act_sym = (pdir0 != 1);
        biari_encode_symbol_aec(p_aec, (int8_t)act_sym, p_ctx + 0);
        act_sym = (pdir0 == pdir1);
        biari_encode_symbol_aec(p_aec, (int8_t)act_sym, p_ctx + 1);
    } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) {    //1010
        /* act_ctx: 3,...,14 */
        pdir0 = new_pdir[pdir0];
        pdir1 = new_pdir[pdir1];
        act_sym = pdir0;
        act_ctx = 3;
        /* 3,4,5 */
        while (act_sym >= 1) {
            biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx);
            act_sym--;
            act_ctx++;
        }
        if (pdir0 != 3) {
            biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx);
        }
        symbol = (pdir0 == pdir1);
        biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 6);
        /* 7,...,14 */
        if (!symbol) {
            switch (pdir0) {
            case 0:
                symbol = (pdir1 == 1);
                biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 7);
                if (!symbol) {
                    symbol = (pdir1 == 2);
                    biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 8);
                }
                break;
            case 1:
                symbol = (pdir1 == 0);
                biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 9);
                if
(!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 10); } break; case 2: symbol = (pdir1 == 0); biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 11); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 12); } break; case 3: symbol = (pdir1 == 0); biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 13); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_aec(p_aec, (uint8_t)symbol, p_ctx + 14); } break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_dhp(aec_t *p_aec, int i_cu_type, int pdir0, int pdir1) { context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); pdir0 = (pdir0 != 0); pdir1 = (pdir1 != 0); if (i_cu_type == PRED_2Nx2N) { biari_encode_symbol_aec(p_aec, (uint8_t)pdir0, p_ctx); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { // 1010 biari_encode_symbol_aec(p_aec, (uint8_t)pdir0, p_ctx + 1); biari_encode_symbol_aec(p_aec, (uint8_t)(pdir0 == pdir1), p_ctx + 2); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_wpm(aec_t *p_aec, int ref_idx, int num_ref) { context_t *p_ctx = p_aec->p_ctx_set->weighted_skip_mode; int org_bits = arienco_bits_written(p_aec); int i, idx_bin = 0; for (i = 0; i < ref_idx; i++) { biari_encode_symbol_aec(p_aec, 0, p_ctx + idx_bin); idx_bin = XAVS2_MIN(idx_bin + 1, 2); } if (ref_idx < num_ref - 1) { biari_encode_symbol_aec(p_aec, 1, p_ctx + idx_bin); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_spatial_skip_mode(aec_t *p_aec, int mode) { context_t *p_ctx = p_aec->p_ctx_set->cu_subtype_index; int org_bits = arienco_bits_written(p_aec); int offset; for (offset = 0; offset < mode; offset++) { biari_encode_symbol_aec(p_aec, 0, p_ctx + offset); } if (mode < DS_MAX_NUM) { biari_encode_symbol_aec(p_aec, 1, p_ctx + offset); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the chroma intra prediction mode of an 8x8 block */ static INLINE int aec_write_intra_pred_cmode(aec_t *p_aec, cu_info_t *p_cu_info, int i_left_cmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_chroma_pred_mode; int i_chroma_mode = p_cu_info->i_intra_mode_c; int org_bits = arienco_bits_written(p_aec); int act_ctx = i_left_cmode != DM_PRED_C; // ? 
1 : 0; if (i_chroma_mode == DM_PRED_C) { biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx); } else { int lmode = tab_intra_mode_luma2chroma[p_cu_info->real_intra_modes[0]]; int is_redundant = lmode >= 0; biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx); i_chroma_mode -= (1 + (is_redundant && i_chroma_mode > lmode)); p_ctx += 2; switch (i_chroma_mode) { case 0: biari_encode_symbol_aec(p_aec, 1, p_ctx); break; case 1: biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_aec(p_aec, 1, p_ctx); break; case 2: biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_aec(p_aec, 1, p_ctx); break; case 3: biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_aec(p_aec, 0, p_ctx); break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "invalid chroma mode %d\n", i_chroma_mode); break; } } #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("@%d Chroma intra pred mode\t\t\t%d\n", g_sym_count++, i_chroma_mode); } #endif /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of an luma CB */ static int write_cbp_bit(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int b8, int bit) { int org_bits = arienco_bits_written(p_aec); int i_cu_level = p_cu_info->i_level; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; int is_hor_part = p_cu_info->i_tu_split == TU_SPLIT_HOR; int is_ver_part = p_cu_info->i_tu_split == TU_SPLIT_VER; int a, b; int x_4x4, y_4x4; ///< ǰ任4x4λ int w_4x4, h_4x4; ///< ǰ任4x4С context_t *p_ctx; /* get context pointer */ if (b8 == 4) { p_ctx = p_aec->p_ctx_set->cbp_contexts + 8; } else { w_4x4 = h_4x4 = 1 << (i_cu_level - MIN_PU_SIZE_IN_BIT); x_4x4 = p_cu_info->i_scu_x << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); y_4x4 = p_cu_info->i_scu_y << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); if (b8 != 4 && transform_split_flag) { if (is_hor_part) { h_4x4 >>= 2; y_4x4 += h_4x4 * b8; } else if (is_ver_part) { w_4x4 >>= 2; x_4x4 += w_4x4 * b8; } else { w_4x4 >>= 1; h_4x4 >>= 1; x_4x4 += (b8 & 1) ? w_4x4 : 0; y_4x4 += (b8 >> 1) ? 
h_4x4 : 0; } } a = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 - 1, y_4x4 ); b = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 , y_4x4 - 1); p_ctx = p_aec->p_ctx_set->cbp_contexts + a + 2 * b; } /* write bits */ biari_encode_symbol_aec(p_aec, (uint8_t)bit, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of a cu */ int aec_write_cu_cbp(aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, xavs2_t *h) { context_t *p_ctx = p_aec->p_ctx_set->cbp_contexts + 4; int org_bits = arienco_bits_written(p_aec); int i_cu_cbp = p_cu_info->i_cbp; int i_cu_type = p_cu_info->i_mode; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; if (IS_INTER_MODE(i_cu_type)) { /* write cbp for inter pred mode --------------------------- */ if (!IS_SKIP_MODE(i_cu_type)) { write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 4, i_cu_cbp == 0); } if (i_cu_cbp) { // write tr_size biari_encode_symbol_aec(p_aec, (uint8_t)transform_split_flag, p_aec->p_ctx_set->transform_split_flag); // write cbp for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_aec(p_aec, 0, p_ctx); break; case 1: biari_encode_symbol_aec(p_aec, 1, p_ctx); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); break; case 2: biari_encode_symbol_aec(p_aec, 1, p_ctx); biari_encode_symbol_aec(p_aec, 0, p_ctx + 2); biari_encode_symbol_aec(p_aec, 1, p_ctx + 2); break; case 3: biari_encode_symbol_aec(p_aec, 1, p_ctx); biari_encode_symbol_aec(p_aec, 1, p_ctx + 2); break; } } // write cbp for luma if (transform_split_flag == 0) { if (i_cu_cbp > 15) { write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); } } else { write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } } } else { /* write cbp for intra pred mode --------------------------- */ // write bits for luma if (transform_split_flag == 0 || i_cu_type == PRED_I_2Nx2N) { write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 0x0F) != 0); } else { write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } // write bits for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); break; case 1: biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); break; case 2: biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 3); biari_encode_symbol_aec(p_aec, 1, p_ctx + 3); break; case 3: biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); biari_encode_symbol_aec(p_aec, 1, p_ctx + 3); break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #if ENABLE_RATE_CONTROL_CU /* 
--------------------------------------------------------------------------- */
static INLINE int aec_write_dqp(aec_t *p_aec, int delta_qp, int last_dqp)
{
    context_t *p_ctx = p_aec->p_ctx_set->delta_qp_contexts;
    int org_bits = arienco_bits_written(p_aec);
    int act_ctx = (last_dqp) ? 1 : 0;
    int act_sym = (delta_qp > 0) ? (2 * delta_qp - 1) : (-2 * delta_qp);

    if (act_sym == 0) {
        biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx);
    } else {
        biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx);
        act_ctx = 2;
        if (act_sym == 1) {
            biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx);
        } else {
            biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx);
            act_ctx++;
            while (act_sym > 2) {
                biari_encode_symbol_aec(p_aec, 0, p_ctx + act_ctx);
                act_sym--;
            }
            biari_encode_symbol_aec(p_aec, 1, p_ctx + act_ctx);
        }
    }

    /* return the number of written bits */
    return arienco_bits_written(p_aec) - org_bits;
}
#endif

/* --------------------------------------------------------------------------- */
static ALWAYS_INLINE void aec_write_last_cg_pos(aec_t *p_aec, int b_luma, int b_dc_diag, int i_cg, int cg_last_x, int cg_last_y, int num_cg, int num_cg_x_minus1, int num_cg_y_minus1)
{
    context_t *p_ctx = p_aec->p_ctx_set->last_cg_contexts + (b_luma ? 0 : NUM_LAST_CG_CTX_LUMA);
    int count;

    if (num_cg == 4) {          // 8x8
        switch (i_cg) {
        case 0:
            biari_encode_symbol_aec(p_aec, 1, p_ctx + 0);
            break;
        case 1:
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 0);
            biari_encode_symbol_aec(p_aec, 1, p_ctx + 1);
            break;
        case 2:
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 0);
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 1);
            biari_encode_symbol_aec(p_aec, 1, p_ctx + 2);
            break;
        default: // case 3:
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 0);
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 1);
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 2);
            break;
        }
    } else {
        if (b_luma && b_dc_diag) {
            XAVS2_SWAP(cg_last_x, cg_last_y);
            XAVS2_SWAP(num_cg_x_minus1, num_cg_y_minus1);
        }
        if (cg_last_x == 0 && cg_last_y == 0) {
            biari_encode_symbol_aec(p_aec, 0, p_ctx + 3);   /* last_cg0_flag */
        } else {
            biari_encode_symbol_aec(p_aec, 1, p_ctx + 3);   /* last_cg0_flag */
            /* last_cg_x */
            biari_encode_tu_aec(p_aec, cg_last_x, num_cg_x_minus1, p_ctx + 4);
            /* last_cg_y or last_cg_y_minus1 */
            count = (cg_last_x == 0);  // when cg_last_x is 0, cg_last_y is non-zero, so cg_last_y - 1 is coded instead
            biari_encode_tu_aec(p_aec, cg_last_y - count, num_cg_y_minus1 - count, p_ctx + 5);
        }
    }
}

/* --------------------------------------------------------------------------- */
static ALWAYS_INLINE void aec_write_last_coeff_pos(aec_t *p_aec, context_t *p_ctx, int isLastCG, int b_one_cg, int cg_x, int cg_y, int last_coeff_pos_x, int last_coeff_pos_y, int b_luma, int b_dc_diag)
{
    int offset;

    if (!isLastCG) {
        last_coeff_pos_x = 3 - last_coeff_pos_x;
        if (b_dc_diag) {
            last_coeff_pos_y = 3 - last_coeff_pos_y;
        }
    }
    if (cg_x == 0 && cg_y > 0 && b_dc_diag) {
        XAVS2_SWAP(last_coeff_pos_x, last_coeff_pos_y);
    }
    /* AVS2-P2, clause 8.3.3.2.14: derive ctxIdxInc for last_coeff_pos_x and last_coeff_pos_y */
    if (b_luma == 0) {            // chroma component: 12 contexts
        offset = b_one_cg ?
0 : 4 + isLastCG * 4; } else if (b_one_cg) { // Log2TransformSize Ϊ 2ռ8 offset = 40 + (b_dc_diag) * 4; } else if (cg_x != 0 && cg_y != 0) { // cg_x cg_y Ϊ㣬ռ8 offset = 32 + isLastCG * 4; } else { // λռ40 offset = (4 * isLastCG + 2 * (cg_x == 0 && cg_y == 0) + (b_dc_diag)) * 4; } p_ctx += offset; switch (last_coeff_pos_x) { case 0: biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); break; } p_ctx += 2; switch (last_coeff_pos_y) { case 0: biari_encode_symbol_aec(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_aec(p_aec, 0, p_ctx + 0); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); biari_encode_symbol_aec(p_aec, 0, p_ctx + 1); break; } } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_luma(aec_t *p_aec, int b_dc_diag, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[0]; context_t *p_ctx_last_coeff_pos = p_aec->p_ctx_set->last_pos_contexts; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, runlevel->b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (i_cg != 0); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_aec(p_aec, 1, p_ctx); } else { biari_encode_symbol_aec(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { // for TB > 4x4, need to write int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 1, b_dc_diag, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; } /* early terminate? 
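if the bits written so far already exceed maxvalue, CHECK_EARLY_RETURN_RUNLEVEL bails out and returns the current bit count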
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, p_ctx_last_coeff_pos, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 1, b_dc_diag); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_aec(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_aec(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_aec(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_aec(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1); biari_encode_tu_aec(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int py = (tab_scan_4x4[14 - ctxpos][1] + 1) >> 1; // 0, 1, 2 int moddiv = b_dc_diag ? (ctxpos > 11 ? 0 : (ctxpos > 4 ? 1 : 2)) : py; // 012 offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (4 + moddiv)) + (num_cg == 1 ? 0 : 4); // 0,...,10 } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_aec(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_aec(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_aec(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level luma, POC[%d]: p_cu: (%d, %d), level %d, cu_type %d\n", h->fdec->i_poc, runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_chroma(aec_t *p_aec, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[1]; context_t *p_ctx_last_coeff_pos = p_aec->p_ctx_set->last_pos_contexts + NUM_LAST_POS_CTX_LUMA; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = 0; // runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (NUM_SIGN_CG_CTX_LUMA); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_aec(p_aec, 1, p_ctx); } else { biari_encode_symbol_aec(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 0, INTRA_PRED_DC_DIAG, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; // δҵһϵCG } /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, p_ctx_last_coeff_pos, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 0, 1); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_aec(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_aec(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_aec(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_aec(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1) + 20; biari_encode_tu_aec(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int moddiv = (ctxpos <= 9); offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (3 + moddiv)) + (num_cg == 1 ? 0 : 3); } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_aec(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_aec(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_aec(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level chroma, p_cu: (%d, %d), level %d, cu_type %d\n", runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ int aec_write_split_flag(aec_t *p_aec, int i_cu_split, int i_cu_level) { context_t *p_ctx = p_aec->p_ctx_set->split_flag + (MAX_CU_SIZE_IN_BIT - i_cu_level); int org_bits = arienco_bits_written(p_aec); biari_encode_symbol_aec(p_aec, (uint8_t)i_cu_split, p_ctx); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("SplitFlag = %3d\n", i_cu_split); } #endif /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mergeflag(aec_t *p_aec, int avail_left, int avail_up, SAOBlkParam *p_sao_param) { int b_merge_left = 0; int b_merge_up; int val = 0; context_t *p_ctx = p_aec->p_ctx_set->sao_merge_type_index; int org_bits = arienco_bits_written(p_aec); int ctx_offset = avail_left + avail_up; if (avail_left) { b_merge_left = (p_sao_param->mergeIdx == SAO_MERGE_LEFT); val = b_merge_left ? 1 : 0; } if (avail_up && !b_merge_left) { b_merge_up = (p_sao_param->mergeIdx == SAO_MERGE_ABOVE); val = b_merge_up ? (1 + avail_left) : 0; } if (ctx_offset == 1) { assert(val <= 1); biari_encode_symbol_aec(p_aec, (uint8_t)val, p_ctx + 0); } else if (ctx_offset == 2) { assert(val <= 2); biari_encode_symbol_aec(p_aec, val & 0x01, p_ctx + 1); if (val != 1) { biari_encode_symbol_aec(p_aec, (val >> 1) & 0x01, p_ctx + 2); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mode(aec_t *p_aec, SAOBlkParam *saoBlkParam) { context_t *p_ctx = p_aec->p_ctx_set->sao_mode; int org_bits = arienco_bits_written(p_aec); int sao_type = saoBlkParam->typeIdc; if (sao_type == SAO_TYPE_OFF) { biari_encode_symbol_aec(p_aec, 1, p_ctx); } else if (sao_type == SAO_TYPE_BO) { biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_aec(p_aec, 1); } else { // SAO_TYPE_EO (0~3) biari_encode_symbol_aec(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_aec(p_aec, 0); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_sao_offset(aec_t *p_aec, int val, int offset_type) { /* --------------------------------------------------------------------------- */ static const int EO_OFFSET_MAP[8] = { 3, 1, 0, 2, 4, 5, 6, 7 }; context_t *p_ctx = p_aec->p_ctx_set->sao_interval_offset_abs; int org_bits = arienco_bits_written(p_aec); int act_sym; assert(offset_type != SAO_CLASS_EO_PLAIN); if (offset_type == SAO_CLASS_EO_FULL_VALLEY) { act_sym = EO_OFFSET_MAP[val + 1]; } else if (offset_type == SAO_CLASS_EO_FULL_PEAK) { act_sym = EO_OFFSET_MAP[-val + 1]; } else { act_sym = XAVS2_ABS(val); } if (act_sym == 0) { if (offset_type == SAO_CLASS_BO) { biari_encode_symbol_aec(p_aec, 1, p_ctx); } else { biari_encode_symbol_eq_prob_aec(p_aec, 1); } } else { int maxvalue = 
tab_saoclip[offset_type][2]; int temp = act_sym; while (temp != 0) { if (offset_type == SAO_CLASS_BO && temp == act_sym) { biari_encode_symbol_aec(p_aec, 0, p_ctx); } else { biari_encode_symbol_eq_prob_aec(p_aec, 0); } temp--; } if (act_sym < maxvalue) { biari_encode_symbol_eq_prob_aec(p_aec, 1); } } if (offset_type == SAO_CLASS_BO && act_sym) { // write sign symbol biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)(val >= 0 ? 0 : 1)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_offset(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int bandIdxBO[4]; bandIdxBO[0] = saoBlkParam->startBand; bandIdxBO[1] = bandIdxBO[0] + 1; bandIdxBO[2] = (saoBlkParam->startBand + saoBlkParam->deltaBand) & 31; bandIdxBO[3] = bandIdxBO[2] + 1; rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[bandIdxBO[0]], SAO_CLASS_BO); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[bandIdxBO[1]], SAO_CLASS_BO); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[bandIdxBO[2]], SAO_CLASS_BO); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[bandIdxBO[3]], SAO_CLASS_BO); } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_VALLEY], SAO_CLASS_EO_FULL_VALLEY); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_VALLEY], SAO_CLASS_EO_HALF_VALLEY); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_PEAK], SAO_CLASS_EO_HALF_PEAK); rate += aec_write_sao_offset(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_PEAK], SAO_CLASS_EO_FULL_PEAK); } return rate; } /* --------------------------------------------------------------------------- */ int write_sao_type(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; int val; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int exp_golomb_order = 1; /* start band */ val = saoBlkParam->startBand; biari_encode_symbol_eq_prob_aec(p_aec, val & 0x01); biari_encode_symbol_eq_prob_aec(p_aec, (val >> 1) & 0x01); biari_encode_symbol_eq_prob_aec(p_aec, (val >> 2) & 0x01); biari_encode_symbol_eq_prob_aec(p_aec, (val >> 3) & 0x01); biari_encode_symbol_eq_prob_aec(p_aec, (val >> 4) & 0x01); /* delta band */ assert(saoBlkParam->deltaBand >= 2); val = saoBlkParam->deltaBand - 2; while (val >= (1 << exp_golomb_order)) { biari_encode_symbol_eq_prob_aec(p_aec, 0); val -= (1 << exp_golomb_order); exp_golomb_order++; } if (exp_golomb_order == 4) { exp_golomb_order = 0; } else { biari_encode_symbol_eq_prob_aec(p_aec, 1); } while (exp_golomb_order--) { // next binary part biari_encode_symbol_eq_prob_aec(p_aec, (uint8_t)((val >> exp_golomb_order) & 1)); } #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("coded band = %d, second band = %d, delta band = %d\n", saoBlkParam->startBand, (saoBlkParam->startBand + saoBlkParam->deltaBand) & 31, saoBlkParam->deltaBand); } #endif } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); val = saoBlkParam->typeIdc; biari_encode_symbol_eq_prob_aec(p_aec, val & 0x01); biari_encode_symbol_eq_prob_aec(p_aec, (val >> 1) & 0x01); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("EO type %d\n", saoBlkParam->typeIdc); } #endif } return rate; } /* 
--------------------------------------------------------------------------- */ int aec_write_alf_lcu_ctrl(aec_t *p_aec, uint8_t iflag) { int org_bits = arienco_bits_written(p_aec); context_t *p_ctx = &(p_aec->p_ctx_set->alf_cu_enable_scmodel[0][0]); biari_encode_symbol_aec(p_aec, iflag, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * codes cu header */ static INLINE int write_cu_header(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int scu_xy) { int rate = 0; int level = p_cu_info->i_level; int mode = p_cu_info->i_mode; int i; // write bits for inter cu type if (h->i_type != SLICE_TYPE_I) { rate += aec_write_cutype(p_aec, mode, level, p_cu_info->i_cbp, h->param->enable_amp); if (h->i_type == SLICE_TYPE_B && (mode >= PRED_2Nx2N && mode <= PRED_nRx2N)) { rate += aec_write_pdir(p_aec, mode, level, p_cu_info->b8pdir[0], p_cu_info->b8pdir[1]); #if XAVS2_TRACE if (p_aec->b_writting) { if (h->i_type == SLICE_TYPE_B) { if (mode >= PRED_2NxN && mode <= PRED_nRx2N) { write_trace_info2("B_Pred_Dir0 ", p_cu_info->b8pdir[0], 1); write_trace_info2("B_Pred_Dir1 ", p_cu_info->b8pdir[1], 1); } else if (mode == PRED_2Nx2N) { write_trace_info2("B_Pred_Dir ", p_cu_info->b8pdir[0], 1); } } } #endif } else if (h->i_type == SLICE_TYPE_F && h->param->enable_dhp && (h->i_ref > 1) && ((mode >= PRED_2Nx2N && mode <= PRED_nRx2N && level > B8X8_IN_BIT) || (mode == PRED_2Nx2N && level == B8X8_IN_BIT))) { rate += aec_write_pdir_dhp(p_aec, mode, p_cu_info->b8pdir[0], p_cu_info->b8pdir[1]); #if XAVS2_TRACE if (p_aec->b_writting) { if (mode >= PRED_2NxN && mode <= PRED_nRx2N) { write_trace_info2("P_Pred_Dir0 ", p_cu_info->b8pdir[0], 1); write_trace_info2("P_Pred_Dir1 ", p_cu_info->b8pdir[1], 1); } else if (mode == PRED_2Nx2N) { write_trace_info2("P_Pred_Dir ", p_cu_info->b8pdir[0], 1); } } #endif } /* write bits for F slice skip/direct mode */ if (h->i_type == SLICE_TYPE_F && IS_SKIP_MODE(mode)) { int weighted_skip_mode = p_cu_info->directskip_wsm_idx; /* write weighted skip mode */ if (h->param->enable_wsm && h->i_ref > 1) { rate += aec_write_wpm(p_aec, weighted_skip_mode, h->i_ref); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("weighted_skipmode1 = %3d \n", weighted_skip_mode); } #endif } /* write bits for F-spatial-skip mode */ if (h->param->enable_mhp_skip && weighted_skip_mode == 0) { int ds_mode = p_cu_info->directskip_mhp_idx; rate += aec_write_spatial_skip_mode(p_aec, ds_mode + 1); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("p_directskip_mode = %3d \n", ds_mode + 1); } #endif } } /* write bits for b-direct-skip mode */ if (SLICE_TYPE_B == h->i_type && IS_SKIP_MODE(mode)) { int ds_mode = p_cu_info->directskip_mhp_idx; rate += aec_write_spatial_skip_mode(p_aec, ds_mode + 1); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("directskip_mhp_idx = %3d \n", ds_mode + 1); } #endif } } /* write bits for intra modes */ if (IS_INTRA_MODE(mode)) { int num_of_intra_block = mode != PRED_I_2Nx2N ? 
4 : 1; #if XAVS2_TRACE // trace the CU type for intra modes if (p_aec->b_writting) { write_trace_info2("cuType", PRED_I_2Nx2N, 1); } #endif /* write "transform_split_flag" and intra CU type for SDIP */ rate += aec_write_intra_cutype(p_aec, mode, level, p_cu_info->i_tu_split, h->param->enable_sdip); /* write intra pred mode */ for (i = 0; i < num_of_intra_block; i++) { rate += aec_write_intra_pred_mode(p_aec, p_cu_info->pred_intra_modes[i]); } if (h->param->chroma_format != CHROMA_400) { int i_left_cmode = DM_PRED_C; /* check left */ if (p_cu_info->i_scu_x > 0) { i_left_cmode = h->cu_info[scu_xy - 1].i_intra_mode_c; } rate += aec_write_intra_pred_cmode(p_aec, p_cu_info, i_left_cmode); } } return rate; } /* --------------------------------------------------------------------------- * writes motion vectors of an 8x8 block */ static ALWAYS_INLINE int write_mvd(aec_t *p_aec, cu_info_t *p_cu_info, int k, int bwd_flag) { int curr_mvd_x = p_cu_info->mvd[bwd_flag][k].x; int curr_mvd_y = p_cu_info->mvd[bwd_flag][k].y; int rate; rate = aec_write_mvd(p_aec, curr_mvd_x, 0); rate += aec_write_mvd(p_aec, curr_mvd_y, 1); #if XAVS2_TRACE if (p_aec->b_writting) { mv_t mvp = p_cu_info->mvp[bwd_flag][k]; if (bwd_flag) { xavs2_trace("@%d BMVD (pred %3d)\t\t\t%d\n", g_sym_count++, mvp.x, curr_mvd_x); xavs2_trace("@%d BMVD (pred %3d)\t\t\t%d\n", g_sym_count++, mvp.y, curr_mvd_y); } else { xavs2_trace("@%d FMVD (pred %3d)\t\t\t%d\n", g_sym_count++, mvp.x, curr_mvd_x); xavs2_trace("@%d FMVD (pred %3d)\t\t\t%d\n", g_sym_count++, mvp.y, curr_mvd_y); } } #endif return rate; } /* --------------------------------------------------------------------------- */ static INLINE int write_cu_refs_mvds(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info) { int mode = p_cu_info->i_mode; int rate = 0; int k, refframe; int pdir; int dmh_mode; /* When CU is intra or Skip mode, no need to code ref_idx */ if (IS_INTRA_MODE(mode) || IS_SKIP_MODE(mode)) { return 0; } // forward reference if (h->i_type != SLICE_TYPE_B && h->i_ref > 1) { for (k = 0; k < p_cu_info->num_pu; k++) { if (p_cu_info->b8pdir[k] == PDIR_FWD || p_cu_info->b8pdir[k] == PDIR_DUAL) { refframe = p_cu_info->ref_idx_1st[k]; rate += aec_write_ref(h, p_aec, refframe); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("Fwd Ref frame no %d\n", refframe); } #endif } } } /* write backward reference indexes of this CU, no need for current AVS2 */ #if XAVS2_TRACE if (p_aec->b_writting) { // xavs2_trace("Bwd Ref frame no %d\n", 0); } #endif /* write DMH mode, "dir_multi_hypothesis_mode" */ if (h->i_type == SLICE_TYPE_F /*&& h->param->enable_dmh*/ && p_cu_info->b8pdir[0] == PDIR_FWD && p_cu_info->b8pdir[1] == PDIR_FWD && p_cu_info->b8pdir[2] == PDIR_FWD && p_cu_info->b8pdir[3] == PDIR_FWD) { if (!(p_cu_info->i_level == B8X8_IN_BIT && p_cu_info->i_mode >= PRED_2NxN && p_cu_info->i_mode <= PRED_nRx2N)) { dmh_mode = p_cu_info->dmh_mode; rate += aec_write_dmh_mode(p_aec, p_cu_info->i_level, dmh_mode); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("dmh_mode = %3d\n", dmh_mode); } #endif } } /* write forward MVD */ for (k = 0; k < p_cu_info->num_pu; k++) { pdir = p_cu_info->b8pdir[k]; if (pdir != PDIR_BWD) { rate += write_mvd(p_aec, p_cu_info, k, 0); } } /* write backward MVD */ if (h->i_type == SLICE_TYPE_B) { for (k = 0; k < p_cu_info->num_pu; k++) { pdir = p_cu_info->b8pdir[k]; if (pdir == PDIR_BWD || pdir == PDIR_BID) { // has backward vector rate += write_mvd(p_aec, p_cu_info, k, 1); } } } return rate; } #if ENABLE_RATE_CONTROL_CU /* 
--------------------------------------------------------------------------- */ int write_cu_cbp_dqp(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int *last_dqp) { int rate = aec_write_cu_cbp(p_aec, p_cu_info, slice_index_cur_cu, h); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("@%d CBP\t\t\t\t%d\n", g_sym_count++, p_cu_info->i_cbp); } #endif if (!p_cu_info->i_cbp) { *last_dqp = 0; } if (p_cu_info->i_cbp != 0 && h->param->i_rc_method == XAVS2_RC_CBR_SCU) { rate += aec_write_dqp(p_aec, cu_get_qp(h, p_cu_info), *last_dqp); #if ENABLE_RATE_CONTROL_CU *last_dqp = p_cu_info->i_delta_qp; #endif #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("@%d Delta quant \t\t\t\t%d\n", g_sym_count++, *last_dqp ); } #endif } return rate; } #endif /* --------------------------------------------------------------------------- */ static INLINE int write_luma_block_coeff(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int i_stride_shift, int is_intra, int intra_mode) { const int16_t(*cg_scan)[2] = NULL; int b_ver = p_cu_info->i_tu_split == TU_SPLIT_VER; int b_hor = p_cu_info->i_tu_split == TU_SPLIT_HOR; int num_cg; int maxvalue = INT32_MAX; int intra_pred_class = INTRA_PRED_DC_DIAG; if (b_hor) { cg_scan = tab_cg_scan_list_hor[i_level - 2]; } else if (b_ver) { cg_scan = tab_cg_scan_list_ver[i_level - 2]; } else { cg_scan = tab_cg_scan_list_nxn[i_level - 2]; } // reset b_hor and b_ver b_hor = (is_intra && tab_intra_mode_scan_type[intra_mode] == INTRA_PRED_HOR && p_cu_info->i_mode != PRED_I_2Nxn && p_cu_info->i_mode != PRED_I_nx2N); b_ver = !b_hor; num_cg = 1 << (i_level + i_level - 4); // number of CGs /* ʼRunLevelṹ */ runlevel->tab_cg_scan = cg_scan; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_stride_shift; runlevel->b_hor = b_hor; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = p_cu_info; // return rate if (IS_INTRA_MODE(p_cu_info->i_mode)) { assert(intra_mode < NUM_INTRA_MODE); intra_pred_class = tab_intra_mode_scan_type[intra_mode]; } return aec_write_run_level_luma(p_aec, intra_pred_class == INTRA_PRED_DC_DIAG, runlevel, h, maxvalue); } /* --------------------------------------------------------------------------- */ static INLINE int write_chroma_block_coeff(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level) { const int num_cg = 1 << (i_level + i_level - 4); int maxvalue = INT32_MAX; /* ʼRunLevelṹ */ UNUSED_PARAMETER(p_cu_info); runlevel->tab_cg_scan = tab_cg_scan_list_nxn[i_level - 2]; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_level; runlevel->b_hor = 0; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = p_cu_info; // return rate return aec_write_run_level_chroma(p_aec, runlevel, h, maxvalue); } /* --------------------------------------------------------------------------- * write CBP, DQUANT, and Luma Coefficients of an cu */ static void xavs2_cu_write(xavs2_t *h, aec_t *p_aec, lcu_info_t *lcu_info, cu_info_t *p_cu_info, int i_level, int img_x, int img_y) { int scu_x = (img_x >> MIN_CU_SIZE_IN_BIT); int scu_y = (img_y >> MIN_CU_SIZE_IN_BIT); int slice_index_cur_cu = cu_get_slice_index(h, scu_x, scu_y); int scu_xy = scu_y * h->i_width_in_mincu + scu_x; /* write CU header */ write_cu_header(h, p_aec, p_cu_info, scu_xy); /* write CU ref and MVD info */ write_cu_refs_mvds(h, p_aec, p_cu_info); /* write coefficients */ if (!(IS_SKIP(p_cu_info)) || p_cu_info->i_cbp != 0) { int pix_x_in_lcu = img_x - lcu_info->pix_x; int 
pix_y_in_lcu = img_y - lcu_info->pix_y; int idx_zorder = tab_b8xy_to_zigzag[pix_y_in_lcu >> MIN_CU_SIZE_IN_BIT][pix_x_in_lcu >> MIN_CU_SIZE_IN_BIT]; int block_idx; int mode = p_cu_info->i_mode; int is_tu_split = p_cu_info->i_tu_split != TU_SPLIT_NON; /* write CBP & DQP */ #if ENABLE_RATE_CONTROL_CU write_cu_cbp_dqp(h, p_aec, p_cu_info, slice_index_cur_cu, &lcu_info->last_dqp); #else aec_write_cu_cbp(p_aec, p_cu_info, slice_index_cur_cu, h); #endif /* write luma coefficients */ for (block_idx = 0; block_idx < 4; block_idx++) { if (p_cu_info->i_cbp & (1 << block_idx)) { int use_wavelet = (i_level == B64X64_IN_BIT && p_cu_info->i_tu_split != TU_SPLIT_CROSS); int i_tu_level = i_level - is_tu_split - use_wavelet; cb_t tb; cu_init_transform_block(i_level, p_cu_info->i_tu_split, block_idx, &tb); write_luma_block_coeff(h, p_aec, p_cu_info, lcu_info->coeffs_y + (idx_zorder << 6) + (block_idx << ((i_level - 1) << 1)), &h->lcu.run_level_write, i_tu_level, xavs2_log2u(tb.w) - use_wavelet, IS_INTRA_MODE(mode), p_cu_info->real_intra_modes[block_idx]); } if (is_tu_split == 0) { break; } } /* write chroma coefficients */ if (h->param->chroma_format != CHROMA_400) { for (block_idx = 4; block_idx < 6; block_idx++) { if (p_cu_info->i_cbp & (1 << block_idx)) { write_chroma_block_coeff(h, p_aec, p_cu_info, lcu_info->coeffs_uv[block_idx - 4] + (idx_zorder << 4), &h->lcu.run_level_write, i_level - 1); } } } } } /* --------------------------------------------------------------------------- */ void xavs2_lcu_write(xavs2_t *h, aec_t *p_aec, lcu_info_t *lcu_info, int i_level, int img_x, int img_y) { int cu_ex = img_x + (1 << i_level); /* down-right point position x */ int cu_ey = img_y + (1 << i_level); /* down-right point position y */ int inside = cu_ex <= h->i_width && cu_ey <= h->i_height; /* down-right point is inside of the frame */ int i_cu_pos = (img_y >> MIN_CU_SIZE_IN_BIT) * h->i_width_in_mincu + (img_x >> MIN_CU_SIZE_IN_BIT); cu_info_t *p_cu_info; assert(img_x < h->i_width && img_y < h->i_height); #if XAVS2_TRACE if (p_aec->b_writting) { if (i_level == h->i_lcu_level) { int slice_type = h->i_type; if (slice_type == SLICE_TYPE_F) { slice_type = 4; /* the AVS2 reference software uses 4 for SLICE_TYPE_F */ } xavs2_trace("\n*********** Pic: %i (I/P) MB: %i Slice: %i Type %d **********\n", h->fenc->i_frame, i_cu_pos, h->i_slice_index, slice_type); } } #endif /* set CU pointer and its neighbor CUs */ p_cu_info = &h->cu_info[i_cu_pos]; if (p_cu_info->i_level < i_level) { int i_level_next = i_level - 1; int i; if (inside) { aec_write_split_flag(p_aec, 1, i_level); } /* 4 sub-CU */ for (i = 0; i < 4; i++) { int sub_pix_x = img_x + ((i & 1) << i_level_next); int sub_pix_y = img_y + ((i >> 1) << i_level_next); if (sub_pix_x >= h->i_width || sub_pix_y >= h->i_height) { continue; /* is outside of the frame */ } xavs2_lcu_write(h, p_aec, lcu_info, i_level_next, sub_pix_x, sub_pix_y); } } else { assert(inside); if (i_level > MIN_CU_SIZE_IN_BIT) { aec_write_split_flag(p_aec, 0, i_level); } xavs2_cu_write(h, p_aec, lcu_info, p_cu_info, i_level, img_x, img_y); } } /* --------------------------------------------------------------------------- * write termination symbol after encoding one lcu */ void xavs2_lcu_terminat_bit_write(aec_t *p_aec, uint8_t bit) { biari_encode_symbol_final_aec(p_aec, bit); #if XAVS2_TRACE if (p_aec->b_writting) { xavs2_trace("@%d %s\t\t%d\n", g_sym_count++, "Decode Sliceterm", bit); } #endif } /** * =========================================================================== * function handler * 
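 * gf_aec_default binds the AEC syntax writers used when actually emitting the bitstream;
 * the est_* members used only for RD estimation are left unbound here (marked "not available"
 * below), cf. gf_aec_rdo / gf_aec_fastrdo / gf_aec_vrdo declared in aec.h.
 *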
=========================================================================== */ binary_t gf_aec_default = { /* syntax elements */ .write_intra_pred_mode = aec_write_intra_pred_mode, .write_ctu_split_flag = aec_write_split_flag, // .est_cu_header = write_cu_header, // noi available // .est_cu_refs_mvds = write_cu_refs_mvds, // not available // .est_luma_block_coeff = write_luma_block_coeff, // not available // .est_chroma_block_coeff = write_chroma_block_coeff, // not available #if ENABLE_RATE_CONTROL_CU .write_cu_cbp_dqp = write_cu_cbp_dqp, #else .write_cu_cbp = aec_write_cu_cbp, #endif .write_sao_mergeflag = write_sao_mergeflag, .write_sao_mode = write_sao_mode, .write_sao_offset = write_sao_offset, .write_sao_type = write_sao_type, .write_alf_lcu_ctrl = aec_write_alf_lcu_ctrl, }; xavs2-1.3/source/encoder/aec.h000066400000000000000000000226531340660520300162470ustar00rootroot00000000000000/* * aec.h * * Description of this file: * AEC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_AEC_H #define XAVS2_AEC_H #include "common.h" /* * =========================================================================== * global variables * =========================================================================== */ extern binary_t gf_aec_default; extern binary_t gf_aec_rdo; extern binary_t gf_aec_fastrdo; extern binary_t gf_aec_vrdo; #define tab_intra_mode_scan_type FPFX(tab_intra_mode_scan_type) extern const int tab_intra_mode_scan_type[NUM_INTRA_MODE]; #if CTRL_OPT_AEC extern context_t g_tab_ctx_mps[4096 * 5]; /* [2 * lg_pmps + mps + cycno * 4096] */ extern context_t g_tab_ctx_lps[4096 * 5]; /* [2 * lg_pmps + mps + cycno * 4096] */ #endif /* --------------------------------------------------------------------------- * number of maximum flush bits in p_aec->reg_flush_bits */ static const uint32_t NUM_FLUSH_BITS = 24; /* --------------------------------------------------------------------------- * AC ENGINE PARAMETERS */ static const uint32_t tab_cwr[4] = { 197, 197, 95, 46 }; /* --------------------------------------------------------------------------- * cu type mapping */ static const int MAP_CU_TYPE[MAX_PRED_MODES] = { 1, 2, 3, 4, 3, 3, 4, 4, 6, 6, 6, 6 }; /* --------------------------------------------------------------------------- * macros */ #define NUM_OF_COEFFS_IN_CG 16 #define CHECK_EARLY_RETURN_RUNLEVEL(aec) \ if ((cur_bits = arienco_bits_written(aec) - org_bits) > maxvalue) {\ return cur_bits;\ } #define MAKE_CONTEXT(lg_pmps, mps, cycno) (((uint16_t)(cycno) << 12) | ((uint16_t)(mps) << 0) | (uint16_t)(lg_pmps << 1)) /* --------------------------------------------------------------------------- * AC ENGINE PARAMETERS */ #define B_BITS 10 #define QUARTER (1 << (B_BITS-2)) #define LG_PMPS_SHIFTNO 2 /* * =========================================================================== * inline function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * copy coding state */ static ALWAYS_INLINE void aec_copy_aec_state(aec_t *dst, aec_t *src) { memcpy(dst, src, sizeof(aec_t)); dst->p_ctx_set = &dst->ctx_set; } /* --------------------------------------------------------------------------- * copy coding state */ static ALWAYS_INLINE void aec_copy_aec_state_rdo(aec_t *dst, aec_t *src) { memcpy(dst, src, sizeof(aec_t) - sizeof(ctx_set_t)); (dst)->p_ctx_set = (src)->p_ctx_set; } /* --------------------------------------------------------------------------- * SAOص״̬״̬ */ static ALWAYS_INLINE void aec_copy_coding_state_sao(aec_t *p_dst, aec_t *p_src) { int num_bytes_aec = (int)((uint8_t *)&p_dst->ctx_set - (uint8_t *)p_dst); int num_bytes_context = (int)(sizeof(ctx_set_t) - ((uint8_t *)&p_dst->ctx_set.sao_merge_type_index[0] - (uint8_t *)&p_dst->ctx_set)); memcpy(p_dst, p_src, num_bytes_aec); p_dst->p_ctx_set = &p_dst->ctx_set; memcpy(&p_dst->ctx_set.sao_merge_type_index[0], &p_src->ctx_set.sao_merge_type_index[0], num_bytes_context); } /* --------------------------------------------------------------------------- * returns the number of currently written bits */ static ALWAYS_INLINE int arienco_bits_written(aec_t *p_aec) { return (int)(((p_aec->p - p_aec->p_start) << 3) + p_aec->i_bits_to_follow + NUM_FLUSH_BITS - p_aec->num_left_flush_bits); } /* --------------------------------------------------------------------------- * ļflush bits */ static INLINE void bitstr_flush_bits(aec_t *p_aec) { switch (NUM_FLUSH_BITS) { case 24: p_aec->p[0] = 
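/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): MAKE_CONTEXT above
 * packs one adaptive binary context into 16 bits: cycno in bits 12 and up,
 * LG_PMPS (the scaled LPS probability) in bits 1..11 and MPS in bit 0.
 * Worked example, using the initial probability that init_contexts() in
 * aec_ctx.c assigns, lg_pmps = (QUARTER << LG_PMPS_SHIFTNO) - 1 = 1023:
 *     MAKE_CONTEXT(1023, 0, 0) = (0 << 12) | (0 << 0) | (1023 << 1) = 0x07FE
 *     MAKE_CONTEXT(1023, 1, 2) = (2 << 12) | (1 << 0) | (1023 << 1) = 0x27FF
 */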
(uint8_t)(p_aec->reg_flush_bits >> 16); p_aec->p[1] = (uint8_t)(p_aec->reg_flush_bits >> 8); p_aec->p[2] = (uint8_t)(p_aec->reg_flush_bits); p_aec->p += 3; break; case 16: p_aec->p[0] = (uint8_t)(p_aec->reg_flush_bits >> 8); p_aec->p[1] = (uint8_t)(p_aec->reg_flush_bits); p_aec->p += 2; break; case 8: p_aec->p[0] = (uint8_t)p_aec->reg_flush_bits; p_aec->p += 1; break; default: fprintf(stderr, "Unsupported number of flush bits %d\n", NUM_FLUSH_BITS); assert(0); break; } p_aec->reg_flush_bits = 0; } /* --------------------------------------------------------------------------- * ļone bit */ static INLINE void bitstr_put_one_bit(aec_t *p_aec, uint32_t b) { p_aec->reg_flush_bits |= ((b) << --p_aec->num_left_flush_bits); if (!p_aec->num_left_flush_bits) { bitstr_flush_bits(p_aec); p_aec->num_left_flush_bits = NUM_FLUSH_BITS; } } /* --------------------------------------------------------------------------- * жCGǷΪȫ顣 * 򷵻1򷵻0 */ static ALWAYS_INLINE int aec_is_cg_allzero(const coeff_t *src_coeff, int i_stride_shift) { assert(sizeof(coeff_t) * 4 == sizeof(uint64_t)); /* 64 bit */ return (*(uint64_t *)(src_coeff) == 0 && *(uint64_t *)(src_coeff + (uint64_t)(1 << i_stride_shift)) == 0 && *(uint64_t *)(src_coeff + (uint64_t)(2 << i_stride_shift)) == 0 && *(uint64_t *)(src_coeff + (uint64_t)(3 << i_stride_shift)) == 0); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int tu_get_cg_run_level_info(runlevel_t *runlevel, const coeff_t *quant_coeff, int i_stride_shift, const int b_hor) { uint64_t c1 = *(uint64_t *)(quant_coeff); uint64_t c2 = *(uint64_t *)(quant_coeff + (intptr_t)(1 << i_stride_shift)); uint64_t c3 = *(uint64_t *)(quant_coeff + (intptr_t)(2 << i_stride_shift)); uint64_t c4 = *(uint64_t *)(quant_coeff + (intptr_t)(3 << i_stride_shift)); if (c1 == 0 && c2 == 0 && c3 == 0 && c4 == 0) { return 0; } else { ALIGN32(coeff_t res[16]); runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int8_t run = 0; // here is not -1 int num_runlevel_pair = 0; int i; g_funcs.transpose_coeff_4x4[b_hor](res, c1, c2, c3, c4); /* prepare run-level pairs in one CG */ for (i = 0; i < 16; i++) { coeff_t level = res[i]; if (level != 0) { num_runlevel_pair++; p_runlevel->level = level; p_runlevel->run = run; p_runlevel++; run = 0; } else { run++; } } runlevel->last_pos_cg = run; return num_runlevel_pair; } } /* * =========================================================================== * function declares * =========================================================================== */ #if CTRL_OPT_AEC /* init AEC context table */ #define init_aec_context_tab FPFX(init_aec_context_tab) void init_aec_context_tab(void); #endif /* --------------------------------------------------------------------------- * coding state initialization (no need to destroy, just free the space is OK) */ #define aec_init_coding_state FPFX(aec_init_coding_state) void aec_init_coding_state (aec_t *p_aec); /* --------------------------------------------------------------------------- * aec functions */ #define aec_start FPFX(aec_start) void aec_start(xavs2_t *h, aec_t *p_aec, uint8_t *p_bs_start, uint8_t *p_bs_end, int b_writing); #define aec_done FPFX(aec_done) void aec_done(aec_t *p_aec); /* AEC */ #define xavs2_lcu_write FPFX(lcu_write) void xavs2_lcu_write(xavs2_t *h, aec_t *p_aec, lcu_info_t *lcu_info, int i_level, int img_x, int img_y); #define xavs2_lcu_terminat_bit_write FPFX(lcu_terminat_bit_write) void xavs2_lcu_terminat_bit_write(aec_t *p_aec, uint8_t bit); #endif // 
XAVS2_AEC_H xavs2-1.3/source/encoder/aec_ctx.c000066400000000000000000000261031340660520300171120ustar00rootroot00000000000000/* * aec_ctx.c * * Description of this file: * AEC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "cudata.h" #include "aec.h" #include "bitstream.h" #include "block_info.h" /* --------------------------------------------------------------------------- * 0: INTRA_PRED_VER * 1: INTRA_PRED_HOR * 2: INTRA_PRED_DC_DIAG */ const int tab_intra_mode_scan_type[NUM_INTRA_MODE] = { 2, 2, 2, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0 }; const int8_t tab_intra_mode_luma2chroma[NUM_INTRA_MODE] = { DC_PRED_C, -1, BI_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1, -1, VERT_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, HOR_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1 }; #if CTRL_OPT_AEC context_t g_tab_ctx_mps[4096 * 5]; /* [2 * lg_pmps + mps + cycno * 4096] */ context_t g_tab_ctx_lps[4096 * 5]; /* [2 * lg_pmps + mps + cycno * 4096] */ static const uint8_t tab_cwr_shift[] = { 3, 3, 4, 5, 5, 5, 5 /* 5, 5, 5, 5 */ }; static const uint16_t tab_lg_pmps_offset[6] = { 0, 0, 0, 197, 95, 46 /* 5, 5, 5, 5 */ }; #endif /* --------------------------------------------------------------------------- * ļʣbits */ static INLINE void bitstr_end_stream(aec_t *p_aec) { if (p_aec->num_left_flush_bits == NUM_FLUSH_BITS) { return; } switch (NUM_FLUSH_BITS - p_aec->num_left_flush_bits) { case 24: p_aec->p[0] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 8)); p_aec->p[1] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 16)); p_aec->p[2] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 24)); p_aec->p += 3; break; case 16: p_aec->p[0] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 8)); p_aec->p[1] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 16)); p_aec->p += 2; break; case 8: p_aec->p[0] = (uint8_t)(p_aec->reg_flush_bits >> (NUM_FLUSH_BITS - 8)); p_aec->p += 1; break; default: fprintf(stderr, "Un-aligned tail bits %d\n", p_aec->num_left_flush_bits); assert(0); break; } p_aec->num_left_flush_bits = NUM_FLUSH_BITS; } /* --------------------------------------------------------------------------- * ļone bitʣλ */ static INLINE void bitstt_put_one_bit_and_remainder(aec_t *p_aec, const int b) { uint32_t N = 1 + p_aec->i_bits_to_follow; 
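/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): this routine applies
 * the classic "outstanding bits" rule of binary arithmetic coding. While the
 * coding interval straddles the midpoint the encoder cannot decide the next
 * output bit and only accumulates i_bits_to_follow; once the decision bit b
 * is known, b is emitted followed by i_bits_to_follow copies of the opposite
 * bit. That is exactly what the expression used below computes when the
 * pattern fits into the flush register:
 *     bits = (1 << i_bits_to_follow) - (!b)
 * Worked example with i_bits_to_follow = 3 (N = 4 bins are written):
 *     b = 1  ->  bits = 8 = 1000b   (a one followed by three zeros)
 *     b = 0  ->  bits = 7 = 0111b   (a zero followed by three ones)
 */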
// ܹı if (N > p_aec->num_left_flush_bits) { /* ıǰֽʣı */ int header_bits = p_aec->num_left_flush_bits; // ǰһֽʣλ uint32_t header_byte = (1 << (header_bits - 1)) - (!b); // ʣλֵ int num_left_bytes = (N - header_bits) >> 3; // ǰֽ⣬ʣӦֽ int num_left_bits = N - header_bits - (num_left_bytes << 3); // ı p_aec->reg_flush_bits |= header_byte; bitstr_flush_bits(p_aec); p_aec->num_left_flush_bits = NUM_FLUSH_BITS - num_left_bits; if (b == 0) { /* b Ϊʱмbitsȫ 1 */ while (num_left_bytes != 0) { *(p_aec->p) = 0xff; p_aec->p++; num_left_bytes--; } /* num_left_bits λ reg_flush_bits λ */ p_aec->reg_flush_bits = 0xffu >> (8 - num_left_bits) << p_aec->num_left_flush_bits; } else { p_aec->p += num_left_bytes; } } else { /* ǰҪbitСдֽʣbit */ uint32_t bits = (1 << p_aec->i_bits_to_follow) - (!b); // ıɵĶֵ p_aec->reg_flush_bits |= bits << (p_aec->num_left_flush_bits - N); p_aec->num_left_flush_bits -= N; if (p_aec->num_left_flush_bits == 0) { bitstr_flush_bits(p_aec); p_aec->reg_flush_bits = 0; p_aec->num_left_flush_bits = NUM_FLUSH_BITS; } } p_aec->i_bits_to_follow = 0; } /** * =========================================================================== * binary * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_shift(uint32_t v) { #if SYS_WINDOWS && !ARCH_X86_64 __asm { bsr eax, v mov v, eax } return 8 - v; #else int i; for (i = 0; !(v & 0x100); i++) { v <<= 1; } return i; #endif } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_set_function_handles(xavs2_t *h, binary_t *fh, int b_writing) { if (b_writing) { // write bitstream to buffer memcpy(fh, &gf_aec_default, sizeof(binary_t)); } else { // estimate bit rate without writing (during RDO) switch (h->param->rdo_bit_est_method) { case 1: memcpy(fh, &gf_aec_fastrdo, sizeof(binary_t)); break; case 2: memcpy(fh, &gf_aec_vrdo, sizeof(binary_t)); break; default: memcpy(fh, &gf_aec_rdo, sizeof(binary_t)); break; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void init_contexts(aec_t *p_aec) { const uint16_t lg_pmps = ((QUARTER << LG_PMPS_SHIFTNO) - 1); uint16_t v = MAKE_CONTEXT(lg_pmps, 0, 0); uint16_t *d = (uint16_t *)&p_aec->ctx_set; int ctx_cnt = sizeof(ctx_set_t) / sizeof(uint16_t); while (ctx_cnt-- != 0) { *d++ = v; } p_aec->p_ctx_set = &p_aec->ctx_set; } #if CTRL_OPT_AEC /* --------------------------------------------------------------------------- */ void init_aec_context_tab(void) { context_t ctx_i; context_t ctx_o; int cycno; int mps; /* init context table */ ctx_i.v = 0; ctx_o.v = 0; /* mps */ for (cycno = 0; cycno < 4; cycno++) { uint32_t cwr = tab_cwr_shift[cycno]; ctx_i.cycno = cycno; ctx_o.cycno = (uint8_t)XAVS2_MAX(cycno, 1); for (mps = 0; mps < 2; mps++) { ctx_i.MPS = (uint8_t)mps; ctx_o.MPS = (uint8_t)mps; for (ctx_i.LG_PMPS = 0; ctx_i.LG_PMPS <= 1024; ctx_i.LG_PMPS++) { uint32_t lg_pmps = ctx_i.LG_PMPS; lg_pmps -= (lg_pmps >> cwr) + (lg_pmps >> (cwr + 2)); ctx_o.LG_PMPS = (uint16_t)lg_pmps; g_tab_ctx_mps[ctx_i.v].v = ctx_o.v; } } } /* lps */ for (cycno = 0; cycno < 4; cycno++) { uint32_t cwr = tab_cwr_shift[cycno]; ctx_i.cycno = cycno; ctx_o.cycno = (uint8_t)XAVS2_MIN(cycno + 1, 3); for (mps = 0; mps < 2; mps++) { ctx_i.MPS = (uint8_t)mps; ctx_o.MPS = (uint8_t)mps; for (ctx_i.LG_PMPS = 0; ctx_i.LG_PMPS <= 1024; ctx_i.LG_PMPS++) { uint32_t lg_pmps = ctx_i.LG_PMPS + tab_lg_pmps_offset[cwr]; 
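/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): the lines that follow
 * precompute the LPS (least probable symbol) state transition for every
 * packed context value, so that a probability update at encode time can be
 * done as a single lookup in g_tab_ctx_lps instead of shifts and adds.
 * Worked example for cycno = 0 (cwr = tab_cwr_shift[0] = 3, and
 * tab_lg_pmps_offset[3] = 197):
 *     LG_PMPS = 900  ->  lg_pmps = 900 + 197 = 1097 >= (256 << 2) = 1024,
 *     so lg_pmps is reflected to (512 << 2) - 1 - 1097 = 950, MPS flips,
 *     and the stored successor keeps cycno = min(0 + 1, 3) = 1.
 */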
if (lg_pmps >= (256 << LG_PMPS_SHIFTNO)) { lg_pmps = (512 << LG_PMPS_SHIFTNO) - 1 - lg_pmps; ctx_o.MPS = !mps; } ctx_o.LG_PMPS = (uint16_t)lg_pmps; g_tab_ctx_lps[ctx_i.v].v = ctx_o.v; } } } } #endif /* --------------------------------------------------------------------------- * initializes the aec_t for the arithmetic coder */ void aec_start(xavs2_t *h, aec_t *p_aec, uint8_t *p_bs_start, uint8_t *p_bs_end, int b_writing) { p_aec->p_start = p_bs_start; p_aec->p = p_bs_start; p_aec->p_end = p_bs_end; p_aec->i_low = 0; p_aec->i_t1 = 0xFF; p_aec->i_bits_to_follow = 0; p_aec->b_writting = 0; p_aec->num_left_flush_bits = NUM_FLUSH_BITS + 1; // to swallow first redundant bit p_aec->reg_flush_bits = 0; if (b_writing) { memset(p_aec->p_start, 0, p_bs_end - p_bs_start); } /* int function handles */ aec_set_function_handles(h, &p_aec->binary, b_writing); /* init contexts */ init_contexts(p_aec); } /* --------------------------------------------------------------------------- * terminates the arithmetic codeword, writes stop bit and stuffing bytes (if any) */ void aec_done(aec_t *p_aec) { int i; uint8_t bit_out_standing = (uint8_t)((p_aec->i_low >> (B_BITS - 1)) & 1); uint8_t bit_ending; bitstt_put_one_bit_and_remainder(p_aec, bit_out_standing); bit_ending = (uint8_t)((p_aec->i_low >> (B_BITS - 2)) & 1); bitstr_put_one_bit(p_aec, bit_ending); /* end of AEC */ bitstr_put_one_bit(p_aec, 1); for (i = 0; i < 7; i++) { bitstr_put_one_bit(p_aec, 0); } /* write stuffing pattern */ bitstr_put_one_bit(p_aec, 1); if (p_aec->num_left_flush_bits != NUM_FLUSH_BITS) { for (i = p_aec->num_left_flush_bits & 7; i > 0; i--) { bitstr_put_one_bit(p_aec, 0); } } /* end bitstream */ bitstr_end_stream(p_aec); } /* --------------------------------------------------------------------------- * create structure for storing coding state */ void aec_init_coding_state(aec_t *p_aec) { if (p_aec == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "aec_create_coding_state: p_aec"); } else { memset(p_aec, 0, sizeof(aec_t)); p_aec->p_ctx_set = &p_aec->ctx_set; } } xavs2-1.3/source/encoder/aec_fastrdo.c000066400000000000000000002043711340660520300177630ustar00rootroot00000000000000/* * aec_fastrdo.c * * Description of this file: * AEC functions definition of FAST_RDO module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "aec.h" #include "bitstream.h" #include "block_info.h" #include "cudata.h" /** * =========================================================================== * binary * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_shift(uint32_t v) { #if SYS_WINDOWS && !ARCH_X86_64 __asm { bsr eax, v mov v, eax } return 8 - v; #else int i; for (i = 0; !(v & 0x100); i++) { v <<= 1; } return i; #endif } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_fastrdo(aec_t *p_aec, uint8_t symbol, context_t *p_ctx) { const uint32_t lg_pmps = p_ctx->LG_PMPS; const uint32_t lg_pmps_shifted = lg_pmps >> LG_PMPS_SHIFTNO; const uint32_t t1 = p_aec->i_t1; const int s = (t1 < lg_pmps_shifted); if (symbol != p_ctx->MPS) { // LPS const uint32_t t = ((-s) & t1) + lg_pmps_shifted; const int shift = aec_get_shift(t); p_aec->i_bits_to_follow += s + shift; } else { // MPS happens p_aec->i_bits_to_follow += s; } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_tu_fastrdo(aec_t *p_aec, int num_zeros, int max_len, context_t *p_ctx) { max_len -= num_zeros; while (num_zeros != 0) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); num_zeros--; } if (max_len) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_eq_prob_fastrdo(aec_t *p_aec, uint8_t symbol) { UNUSED_PARAMETER(symbol); p_aec->i_bits_to_follow++; } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbols_eq_prob_fastrdo(aec_t *p_aec, uint32_t val, int len) { UNUSED_PARAMETER(val); p_aec->i_bits_to_follow += len; } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_final_fastrdo(aec_t *p_aec, uint8_t symbol) { const uint32_t t1 = p_aec->i_t1; if (symbol) { p_aec->i_bits_to_follow += (!t1) + 8; p_aec->i_t1 = 0; } else { // MPS p_aec->i_bits_to_follow += (!t1); p_aec->i_t1 = (t1 - 1) & 0xff; } } /** * =========================================================================== * syntax coding * =========================================================================== */ /* --------------------------------------------------------------------------- * cu type for B/F/P frame */ static INLINE int aec_write_cutype_fastrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_cu_cbp, int is_amp_enabled) { context_t *p_ctx = p_aec->p_ctx_set->cu_type_contexts; int org_bits = arienco_bits_written(p_aec); int act_sym = MAP_CU_TYPE[i_cu_type]; if (i_cu_type == PRED_SKIP && i_cu_cbp == 0) { act_sym = 0; } switch (act_sym) { case 0: // SKIP biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); break; case 1: // DIRECT biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; case 2: // 2Nx2N biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); break; case 3: // 2NxN, 2NxnU, 2NxnD biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 3); if 
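/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): the *_fastrdo
 * primitives above never touch the output buffer; they only add an
 * approximate bin count to i_bits_to_follow (one bit per equal-probability
 * bin; for a context-coded bin, s bits on the MPS path and s + shift bits on
 * the LPS path, where s and shift are derived from the current interval).
 * Every syntax writer in this file therefore estimates its rate with the
 * same pattern:
 *
 *     int org_bits = arienco_bits_written(p_aec);
 *     biari_encode_symbol_fastrdo(p_aec, symbol, p_ctx);
 *     rate = arienco_bits_written(p_aec) - org_bits;
 *
 * which is what makes this coder usable for RD cost estimation without
 * producing a bitstream.
 */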
(is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_2NxN) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_fastrdo(p_aec, (uint8_t)(i_cu_type == PRED_2NxnU), p_ctx + 1); // AMP shape } } break; case 4: // Nx2N, nLx2N, nRx2N biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 4); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_Nx2N) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_fastrdo(p_aec, (uint8_t)(i_cu_type == PRED_nLx2N), p_ctx + 1); // AMP shape } } break; //case 5: // NxN, not enabled // biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); // biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); // biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); // biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); // biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 4); // if (i_cu_level > B8X8_IN_BIT) { // biari_encode_symbol_final_fastrdo(p_aec, 1); // } // break; default: // case 6: // Intra biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 4); if (i_cu_level > B8X8_IN_BIT) { biari_encode_symbol_final_fastrdo(p_aec, 0); } break; } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode a pair of intra prediction modes of a given cu */ static int aec_write_intra_pred_mode_fastrdo(aec_t *p_aec, int ipmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_luma_pred_mode; int org_bits = arienco_bits_written(p_aec); if (ipmode >= 0) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx ); biari_encode_symbol_fastrdo(p_aec, (uint8_t)((ipmode & 0x10) >> 4), p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)((ipmode & 0x08) >> 3), p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, (uint8_t)((ipmode & 0x04) >> 2), p_ctx + 3); biari_encode_symbol_fastrdo(p_aec, (uint8_t)((ipmode & 0x02) >> 1), p_ctx + 4); biari_encode_symbol_fastrdo(p_aec, (uint8_t)((ipmode & 0x01) ), p_ctx + 5); } else { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx ); biari_encode_symbol_fastrdo(p_aec, (uint8_t)(ipmode + 2), p_ctx + 6); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the reference parameter of a given cu */ static INLINE int aec_write_ref_fastrdo(xavs2_t *h, aec_t *p_aec, int ref_idx) { context_t *p_ctx = p_aec->p_ctx_set->pu_reference_index; int org_bits = arienco_bits_written(p_aec); int act_sym = ref_idx; /* 0λ0ģ1λ1ģ2 */ if (act_sym == 0) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } else { int act_ctx = 0; biari_encode_symbol_fastrdo(p_aec, 0, p_ctx++); while (--act_sym != 0) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); if (!act_ctx) { p_ctx++; } } if (ref_idx < h->i_ref - 1) { 
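/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): aec_write_ref_fastrdo
 * uses a truncated unary binarization: ref_idx 0 is a single "1" bin, any
 * other index is coded as ref_idx "0" bins followed by a terminating "1",
 * and the terminator is dropped for the largest index (h->i_ref - 1).
 * Worked example with h->i_ref = 4 reference frames:
 *     ref_idx 0 -> 1
 *     ref_idx 1 -> 0 1
 *     ref_idx 2 -> 0 0 1
 *     ref_idx 3 -> 0 0 0      (no terminator, it is the last index)
 */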
biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the motion vector data */ static INLINE int aec_write_mvd_fastrdo(aec_t *p_aec, int mvd, int xy) { context_t *p_ctx = p_aec->p_ctx_set->mvd_contexts[xy]; int org_bits = arienco_bits_written(p_aec); uint32_t act_sym = XAVS2_ABS(mvd); if (act_sym < 3) { // 0, 1, 2 if (act_sym == 0) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); } else if (act_sym == 1) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); } else { // act_sym == 2 biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); } } else { int exp_golomb_order = 0; biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); if ((act_sym & 1) == 1) { // odds >3 biari_encode_symbol_eq_prob_fastrdo(p_aec, 0); act_sym = (act_sym - 3) >> 1; } else { // even >3 biari_encode_symbol_eq_prob_fastrdo(p_aec, 1); act_sym = (act_sym - 4) >> 1; } /* exp_golomb part */ while (act_sym >= (uint32_t)(1 << exp_golomb_order)) { act_sym -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_fastrdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_fastrdo(p_aec, act_sym, exp_golomb_order); // Exp-Golomb: suffix } if (mvd != 0) { // mv sign biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)(mvd < 0)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dmh_mode_fastrdo(aec_t *p_aec, int i_cu_level, int dmh_mode) { static const int iEncMapTab[9] = { 0, 5, 6, 1, 2, 7, 8, 3, 4 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index + 3; int org_bits = arienco_bits_written(p_aec); int symbol = dmh_mode != 0; p_ctx += (i_cu_level - MIN_CU_SIZE_IN_BIT) * 3; biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx); if (symbol) { int iMapVal = iEncMapTab[dmh_mode]; if (iMapVal < 3) { symbol = (iMapVal != 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)symbol); } else if (iMapVal < 5) { symbol = (iMapVal != 3); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)symbol); } else { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); symbol = (iMapVal >= 7); biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)symbol); symbol = !(iMapVal & 1); biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)symbol); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * write "transform_split_flag" and SDIP type for intra CU */ static INLINE int aec_write_intra_cutype_fastrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_tu_split, int is_sdip_enabled) { context_t *p_ctx = p_aec->p_ctx_set->transform_split_flag; int org_bits = arienco_bits_written(p_aec); uint8_t transform_split_flag = i_tu_split != TU_SPLIT_NON; /* just write split or not */ if (i_cu_level == B8X8_IN_BIT) { 
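/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): aec_write_mvd_fastrdo
 * above codes |mvd| < 3 with a short context-coded unary prefix; larger
 * magnitudes are coded as the prefix "1 1 1", one equal-probability parity
 * bin, and a 0th-order Exp-Golomb code of (|mvd| - 3) >> 1 (odd values) or
 * (|mvd| - 4) >> 1 (even values), followed by a sign bin whenever mvd != 0.
 * Worked example, mvd = +4:
 *     prefix        : 1 1 1     (context-coded)
 *     parity        : 1         (even magnitude)
 *     Exp-Golomb(0) : 1         (value (4 - 4) >> 1 = 0)
 *     sign          : 0         (positive)
 */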
biari_encode_symbol_fastrdo(p_aec, transform_split_flag, p_ctx + 1); } else if (is_sdip_enabled && (i_cu_level == B32X32_IN_BIT || i_cu_level == B16X16_IN_BIT)) { biari_encode_symbol_fastrdo(p_aec, transform_split_flag, p_ctx + 2); if (transform_split_flag) { p_ctx = p_aec->p_ctx_set->intra_pu_type_contexts; biari_encode_symbol_fastrdo(p_aec, i_cu_type == PRED_I_2Nxn, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_fastrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int pdir0, int pdir1) { int new_pdir[4] = { 2, 1, 3, 0 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); int act_ctx = 0; int act_sym; int symbol; if (i_cu_type == PRED_2Nx2N) { /* һCUֻһPUPUĸʹ3: 0, 1, 2 */ act_sym = pdir0; while (act_sym >= 1) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } } else if ((i_cu_type >= PRED_2NxN && i_cu_type <= PRED_nRx2N) && i_cu_level == B8X8_IN_BIT) { /* һCUΪPUCUСΪ8x8ʱԤΪ4x88x4ÿPUֻǵԤ⣬ * ܼ4ϣҪλBit b_pu_type_min_index ʹ */ p_ctx = p_aec->p_ctx_set->b_pu_type_min_index; pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = (pdir0 != 1); biari_encode_symbol_fastrdo(p_aec, (int8_t)act_sym, p_ctx + 0); act_sym = (pdir0 == pdir1); biari_encode_symbol_fastrdo(p_aec, (int8_t)act_sym, p_ctx + 1); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { //1010 /* act_ctx: 3,...,14 */ pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = pdir0; act_ctx = 3; /* 3,4,5 */ while (act_sym >= 1) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } symbol = (pdir0 == pdir1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 6); /* 7,...,14 */ if (!symbol) { switch (pdir0) { case 0: symbol = (pdir1 == 1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 7); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 8); } break; case 1: symbol = (pdir1 == 0); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 9); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 10); } break; case 2: symbol = (pdir1 == 0); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 11); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 12); } break; case 3: symbol = (pdir1 == 0); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 13); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)symbol, p_ctx + 14); } break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_dhp_fastrdo(aec_t *p_aec, int i_cu_type, int pdir0, int pdir1) { context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); pdir0 = (pdir0 != 0); pdir1 = (pdir1 != 0); if (i_cu_type == PRED_2Nx2N) { biari_encode_symbol_fastrdo(p_aec, (uint8_t)pdir0, p_ctx); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { // 1010 biari_encode_symbol_fastrdo(p_aec, (uint8_t)pdir0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 
(uint8_t)(pdir0 == pdir1), p_ctx + 2); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_wpm_fastrdo(aec_t *p_aec, int ref_idx, int num_ref) { context_t *p_ctx = p_aec->p_ctx_set->weighted_skip_mode; int org_bits = arienco_bits_written(p_aec); int i, idx_bin = 0; for (i = 0; i < ref_idx; i++) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + idx_bin); idx_bin = XAVS2_MIN(idx_bin + 1, 2); } if (ref_idx < num_ref - 1) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + idx_bin); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_spatial_skip_mode_fastrdo(aec_t *p_aec, int mode) { context_t *p_ctx = p_aec->p_ctx_set->cu_subtype_index; int org_bits = arienco_bits_written(p_aec); int offset; for (offset = 0; offset < mode; offset++) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + offset); } if (mode < DS_MAX_NUM) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + offset); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the chroma intra prediction mode of an 8x8 block */ static INLINE int aec_write_intra_pred_cmode_fastrdo(aec_t *p_aec, cu_info_t *p_cu_info, int i_left_cmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_chroma_pred_mode; int i_chroma_mode = p_cu_info->i_intra_mode_c; int org_bits = arienco_bits_written(p_aec); int act_ctx = i_left_cmode != DM_PRED_C; // ? 1 : 0; if (i_chroma_mode == DM_PRED_C) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } else { int lmode = tab_intra_mode_luma2chroma[p_cu_info->real_intra_modes[0]]; int is_redundant = lmode >= 0; biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); i_chroma_mode -= (1 + (is_redundant && i_chroma_mode > lmode)); p_ctx += 2; switch (i_chroma_mode) { case 0: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); break; case 1: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); break; case 2: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); break; case 3: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "invalid chroma mode %d\n", i_chroma_mode); break; } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of an luma CB */ static int write_cbp_bit_fastrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int b8, int bit) { int org_bits = arienco_bits_written(p_aec); int i_cu_level = p_cu_info->i_level; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; int is_hor_part = p_cu_info->i_tu_split == TU_SPLIT_HOR; int is_ver_part = p_cu_info->i_tu_split == TU_SPLIT_VER; int a, b; int x_4x4, y_4x4; ///< ǰ任4x4λ int w_4x4, h_4x4; ///< ǰ任4x4С context_t *p_ctx; /* get context pointer */ if (b8 == 4) { p_ctx = p_aec->p_ctx_set->cbp_contexts + 8; } else { w_4x4 = h_4x4 = 1 << (i_cu_level - MIN_PU_SIZE_IN_BIT); x_4x4 = p_cu_info->i_scu_x << 
(MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); y_4x4 = p_cu_info->i_scu_y << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); if (b8 != 4 && transform_split_flag) { if (is_hor_part) { h_4x4 >>= 2; y_4x4 += h_4x4 * b8; } else if (is_ver_part) { w_4x4 >>= 2; x_4x4 += w_4x4 * b8; } else { w_4x4 >>= 1; h_4x4 >>= 1; x_4x4 += (b8 & 1) ? w_4x4 : 0; y_4x4 += (b8 >> 1) ? h_4x4 : 0; } } a = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 - 1, y_4x4 ); b = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 , y_4x4 - 1); p_ctx = p_aec->p_ctx_set->cbp_contexts + a + 2 * b; } /* write bits */ biari_encode_symbol_fastrdo(p_aec, (uint8_t)bit, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of a cu */ static INLINE int aec_write_cu_cbp_fastrdo(aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, xavs2_t *h) { context_t *p_ctx = p_aec->p_ctx_set->cbp_contexts + 4; int org_bits = arienco_bits_written(p_aec); int i_cu_cbp = p_cu_info->i_cbp; int i_cu_type = p_cu_info->i_mode; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; if (IS_INTER_MODE(i_cu_type)) { /* write cbp for inter pred mode --------------------------- */ if (!IS_SKIP_MODE(i_cu_type)) { write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 4, i_cu_cbp == 0); } if (i_cu_cbp) { // write tr_size biari_encode_symbol_fastrdo(p_aec, (uint8_t)transform_split_flag, p_aec->p_ctx_set->transform_split_flag); // write cbp for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); break; case 1: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); break; case 2: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); break; case 3: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); break; } } // write cbp for luma if (transform_split_flag == 0) { if (i_cu_cbp > 15) { write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); } } else { write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } } } else { /* write cbp for intra pred mode --------------------------- */ // write bits for luma if (transform_split_flag == 0 || i_cu_type == PRED_I_2Nx2N) { write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 0x0F) != 0); } else { write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_fastrdo(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } // write bits for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); break; case 1: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); 
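/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): i_cu_cbp is a small
 * bit mask. Bits 0..3 flag non-zero coefficients in the four luma transform
 * blocks (tested with i_cu_cbp & (1 << block_idx)), and bits 4..5 flag the
 * two chroma components, which is why the chroma part of this function
 * switches on (i_cu_cbp >> 4) & 0x03.
 * Worked example: i_cu_cbp = 0x13 = 010011b means luma blocks 0 and 1 and
 * the first chroma component carry coefficients, so the chroma switch takes
 * case 1.
 */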
biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); break; case 2: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 3); break; case 3: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 3); break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #if ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dqp_fastrdo(aec_t *p_aec, int delta_qp, int last_dqp) { context_t *p_ctx = p_aec->p_ctx_set->delta_qp_contexts; int org_bits = arienco_bits_written(p_aec); int act_ctx = (last_dqp) ? 1 : 0; int act_sym = (delta_qp > 0) ? (2 * delta_qp - 1) : (-2 * delta_qp); if (act_sym == 0) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); act_ctx = 2; if (act_sym == 1) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); act_ctx++; while (act_sym > 2) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; } biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + act_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #endif /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_cg_pos(aec_t *p_aec, int b_luma, int intra_pred_class, int i_cg, int cg_last_x, int cg_last_y, int num_cg, int num_cg_x_minus1, int num_cg_y_minus1) { context_t *p_ctx = p_aec->p_ctx_set->last_cg_contexts + (b_luma ? 0 : NUM_LAST_CG_CTX_LUMA); int count; if (num_cg == 4) { // 8x8 switch (i_cg) { case 0: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 2); break; default: // case 3: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 2); break; } } else { if (b_luma && intra_pred_class == INTRA_PRED_DC_DIAG) { XAVS2_SWAP(cg_last_x, cg_last_y); XAVS2_SWAP(num_cg_x_minus1, num_cg_y_minus1); } if (cg_last_x == 0 && cg_last_y == 0) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 3); /* last_cg0_flag */ } else { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 3); /* last_cg0_flag */ /* last_cg_x */ biari_encode_tu_fastrdo(p_aec, cg_last_x, num_cg_x_minus1, p_ctx + 4); /* last_cg_y or last_cg_y_minus1 */ count = (cg_last_x == 0); // cg_last_xΪ㣬cg_last_yдһ㣨һ㣩 biari_encode_tu_fastrdo(p_aec, cg_last_y - count, num_cg_y_minus1 - count, p_ctx + 5); } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_coeff_pos(aec_t *p_aec, int isLastCG, int b_one_cg, int cg_x, int cg_y, int last_coeff_pos_x, int last_coeff_pos_y, int b_luma, int intra_pred_class) { context_t *p_ctx = p_aec->p_ctx_set->last_pos_contexts + (b_luma ? 
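/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): aec_write_dqp_fastrdo
 * above first maps the signed delta QP to a non-negative symbol,
 * act_sym = 2*dqp - 1 for dqp > 0 and -2*dqp otherwise, and then codes that
 * symbol in unary ("1" for zero, otherwise a run of "0" bins closed by "1").
 * Worked example:
 *     dqp =  0 -> act_sym = 0 -> 1
 *     dqp = +1 -> act_sym = 1 -> 0 1
 *     dqp = -1 -> act_sym = 2 -> 0 0 1
 *     dqp = +2 -> act_sym = 3 -> 0 0 0 1
 */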
0 : NUM_LAST_POS_CTX_LUMA); int offset; if (!isLastCG) { last_coeff_pos_x = 3 - last_coeff_pos_x; if (intra_pred_class == INTRA_PRED_DC_DIAG) { last_coeff_pos_y = 3 - last_coeff_pos_y; } } if (cg_x == 0 && cg_y > 0 && intra_pred_class == INTRA_PRED_DC_DIAG) { XAVS2_SWAP(last_coeff_pos_x, last_coeff_pos_y); } /* AVS2-P2: 8.3.3.2.14 ȷlast_coeff_pos_x last_coeff_pos_y ctxIdxInc */ if (b_luma == 0) { // ɫȷռ12 offset = b_one_cg ? 0 : 4 + isLastCG * 4; } else if (b_one_cg) { // Log2TransformSize Ϊ 2ռ8 offset = 40 + (intra_pred_class == INTRA_PRED_DC_DIAG) * 4; } else if (cg_x != 0 && cg_y != 0) { // cg_x cg_y Ϊ㣬ռ8 offset = 32 + isLastCG * 4; } else { // λռ40 offset = (4 * isLastCG + 2 * (cg_x == 0 && cg_y == 0) + (intra_pred_class == INTRA_PRED_DC_DIAG)) * 4; } p_ctx += offset; switch (last_coeff_pos_x) { case 0: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); break; } p_ctx += 2; switch (last_coeff_pos_y) { case 0: biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + 1); break; } } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_luma_fastrdo(aec_t *p_aec, int intra_pred_class, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[0]; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 
鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, runlevel->b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (i_cg != 0); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { // for TB > 4x4, need to write int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 1, intra_pred_class, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; } /* early terminate? */ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 1, intra_pred_class); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_fastrdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_fastrdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_fastrdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_fastrdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1); biari_encode_tu_fastrdo(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int py = (tab_scan_4x4[14 - ctxpos][1] + 1) >> 1; // 0, 1, 2 int moddiv = (intra_pred_class != INTRA_PRED_DC_DIAG) ? py : (ctxpos > 11 ? 0 : (ctxpos > 4 ? 1 : 2)); // 012 offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (4 + moddiv)) + (num_cg == 1 ? 
0 : 4); // 0,...,10 } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_fastrdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? */ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level luma, POC[%d]: p_cu: (%d, %d), level %d, cu_type %d\n", h->fdec->i_poc, runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_chroma_fastrdo(aec_t *p_aec, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[1]; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = 0; // runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (NUM_SIGN_CG_CTX_LUMA); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 0, INTRA_PRED_DC_DIAG, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; // δҵһϵCG } /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 0, INTRA_PRED_DC_DIAG); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_fastrdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_fastrdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_fastrdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_fastrdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1) + 20; biari_encode_tu_fastrdo(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int moddiv = (ctxpos <= 9); offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (3 + moddiv)) + (num_cg == 1 ? 0 : 3); } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_fastrdo(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_fastrdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_fastrdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level chroma, p_cu: (%d, %d), level %d, cu_type %d\n", runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ int aec_write_split_flag_fastrdo(aec_t *p_aec, int i_cu_split, int i_cu_level) { context_t *p_ctx = p_aec->p_ctx_set->split_flag + (MAX_CU_SIZE_IN_BIT - i_cu_level); int org_bits = arienco_bits_written(p_aec); biari_encode_symbol_fastrdo(p_aec, (uint8_t)i_cu_split, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mergeflag_fastrdo(aec_t *p_aec, int avail_left, int avail_up, SAOBlkParam *p_sao_param) { int b_merge_left = 0; int b_merge_up; int val = 0; context_t *p_ctx = p_aec->p_ctx_set->sao_merge_type_index; int org_bits = arienco_bits_written(p_aec); int ctx_offset = avail_left + avail_up; if (avail_left) { b_merge_left = (p_sao_param->mergeIdx == SAO_MERGE_LEFT); val = b_merge_left ? 1 : 0; } if (avail_up && !b_merge_left) { b_merge_up = (p_sao_param->mergeIdx == SAO_MERGE_ABOVE); val = b_merge_up ? (1 + avail_left) : 0; } if (ctx_offset == 1) { assert(val <= 1); biari_encode_symbol_fastrdo(p_aec, (uint8_t)val, p_ctx + 0); } else if (ctx_offset == 2) { assert(val <= 2); biari_encode_symbol_fastrdo(p_aec, val & 0x01, p_ctx + 1); if (val != 1) { biari_encode_symbol_fastrdo(p_aec, (val >> 1) & 0x01, p_ctx + 2); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mode_fastrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { context_t *p_ctx = p_aec->p_ctx_set->sao_mode; int org_bits = arienco_bits_written(p_aec); int sao_type = saoBlkParam->typeIdc; if (sao_type == SAO_TYPE_OFF) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } else if (sao_type == SAO_TYPE_BO) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_fastrdo(p_aec, 1); } else { // SAO_TYPE_EO (0~3) biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_fastrdo(p_aec, 0); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_sao_offset_fastrdo(aec_t *p_aec, int val, int offset_type) { /* --------------------------------------------------------------------------- */ static const int EO_OFFSET_MAP[8] = { 3, 1, 0, 2, 4, 5, 6, 7 }; context_t *p_ctx = p_aec->p_ctx_set->sao_interval_offset_abs; int org_bits = arienco_bits_written(p_aec); int act_sym; assert(offset_type != SAO_CLASS_EO_PLAIN); if (offset_type == SAO_CLASS_EO_FULL_VALLEY) { act_sym = EO_OFFSET_MAP[val + 1]; } else if (offset_type == SAO_CLASS_EO_FULL_PEAK) { act_sym = EO_OFFSET_MAP[-val + 1]; } else { act_sym = XAVS2_ABS(val); } if (act_sym == 0) { if (offset_type == SAO_CLASS_BO) { biari_encode_symbol_fastrdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_eq_prob_fastrdo(p_aec, 1); } } else { int maxvalue = tab_saoclip[offset_type][2]; int 
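/* ---------------------------------------------------------------------------
 * Illustrative note (not part of the original source): write_sao_mode_fastrdo
 * above signals the per-CTB SAO mode with at most two bins:
 *     SAO_TYPE_OFF        -> 1        (context-coded)
 *     SAO_TYPE_BO         -> 0 1      (context bin, then equal-probability)
 *     SAO_TYPE_EO_0..45   -> 0 0
 * The exact EO class is written separately by write_sao_type_fastrdo as two
 * equal-probability bins.
 */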
temp = act_sym; while (temp != 0) { if (offset_type == SAO_CLASS_BO && temp == act_sym) { biari_encode_symbol_fastrdo(p_aec, 0, p_ctx); } else { biari_encode_symbol_eq_prob_fastrdo(p_aec, 0); } temp--; } if (act_sym < maxvalue) { biari_encode_symbol_eq_prob_fastrdo(p_aec, 1); } } if (offset_type == SAO_CLASS_BO && act_sym) { // write sign symbol biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)(val >= 0 ? 0 : 1)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_offset_fastrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int bandIdxBO[4]; bandIdxBO[0] = saoBlkParam->startBand; bandIdxBO[1] = bandIdxBO[0] + 1; bandIdxBO[2] = (saoBlkParam->startBand + saoBlkParam->deltaBand) & 31; bandIdxBO[3] = bandIdxBO[2] + 1; rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[bandIdxBO[0]], SAO_CLASS_BO); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[bandIdxBO[1]], SAO_CLASS_BO); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[bandIdxBO[2]], SAO_CLASS_BO); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[bandIdxBO[3]], SAO_CLASS_BO); } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_VALLEY], SAO_CLASS_EO_FULL_VALLEY); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_VALLEY], SAO_CLASS_EO_HALF_VALLEY); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_PEAK], SAO_CLASS_EO_HALF_PEAK); rate += aec_write_sao_offset_fastrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_PEAK], SAO_CLASS_EO_FULL_PEAK); } return rate; } /* --------------------------------------------------------------------------- */ int write_sao_type_fastrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; int val; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int exp_golomb_order = 1; /* start band */ val = saoBlkParam->startBand; biari_encode_symbol_eq_prob_fastrdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_fastrdo(p_aec, (val >> 1) & 0x01); biari_encode_symbol_eq_prob_fastrdo(p_aec, (val >> 2) & 0x01); biari_encode_symbol_eq_prob_fastrdo(p_aec, (val >> 3) & 0x01); biari_encode_symbol_eq_prob_fastrdo(p_aec, (val >> 4) & 0x01); /* delta band */ assert(saoBlkParam->deltaBand >= 2); val = saoBlkParam->deltaBand - 2; while (val >= (1 << exp_golomb_order)) { biari_encode_symbol_eq_prob_fastrdo(p_aec, 0); val -= (1 << exp_golomb_order); exp_golomb_order++; } if (exp_golomb_order == 4) { exp_golomb_order = 0; } else { biari_encode_symbol_eq_prob_fastrdo(p_aec, 1); } while (exp_golomb_order--) { // next binary part biari_encode_symbol_eq_prob_fastrdo(p_aec, (uint8_t)((val >> exp_golomb_order) & 1)); } } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); val = saoBlkParam->typeIdc; biari_encode_symbol_eq_prob_fastrdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_fastrdo(p_aec, (val >> 1) & 0x01); } return rate; } /* --------------------------------------------------------------------------- */ int aec_write_alf_lcu_ctrl_fastrdo(aec_t *p_aec, uint8_t iflag) { int org_bits = arienco_bits_written(p_aec); context_t *p_ctx = 
&(p_aec->p_ctx_set->alf_cu_enable_scmodel[0][0]); biari_encode_symbol_fastrdo(p_aec, iflag, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * codes cu header */ static int write_cu_header_fastrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int rate = 0; int level = p_cu->cu_info.i_level; int mode = p_cu->cu_info.i_mode; int i; // write bits for inter cu type if (h->i_type != SLICE_TYPE_I) { rate += aec_write_cutype_fastrdo(p_aec, mode, level, p_cu->cu_info.i_cbp, h->param->enable_amp); if (h->i_type == SLICE_TYPE_B && (mode >= PRED_2Nx2N && mode <= PRED_nRx2N)) { rate += aec_write_pdir_fastrdo(p_aec, mode, level, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } else if (h->i_type == SLICE_TYPE_F && h->param->enable_dhp && (h->i_ref > 1) && ((mode >= PRED_2Nx2N && mode <= PRED_nRx2N && level > B8X8_IN_BIT) || (mode == PRED_2Nx2N && level == B8X8_IN_BIT))) { rate += aec_write_pdir_dhp_fastrdo(p_aec, mode, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } /* write bits for F slice skip/direct mode */ if (IS_SKIP_MODE(mode)) { int b_write_spatial_skip = 0; if (h->i_type == SLICE_TYPE_F) { int weighted_skip_mode = p_cu->cu_info.directskip_wsm_idx; /* write weighted skip mode */ if (h->param->enable_wsm && h->i_ref > 1) { rate += aec_write_wpm_fastrdo(p_aec, weighted_skip_mode, h->i_ref); } /* write bits for F-spatial-skip mode */ b_write_spatial_skip = (h->param->enable_mhp_skip && (weighted_skip_mode == 0)); } b_write_spatial_skip = b_write_spatial_skip || (SLICE_TYPE_B == h->i_type); /* write bits for b-direct-skip mode */ if (b_write_spatial_skip) { rate += aec_write_spatial_skip_mode_fastrdo(p_aec, p_cu->cu_info.directskip_mhp_idx + 1); } } } // write bits for intra modes if (IS_INTRA_MODE(mode)) { int num_of_intra_block = mode != PRED_I_2Nx2N ? 
4 : 1; /* write "transform_split_flag" and cu_type for SDIP */ rate += aec_write_intra_cutype_fastrdo(p_aec, mode, level, p_cu->cu_info.i_tu_split, h->param->enable_sdip); /* write intra pred mode */ for (i = 0; i < num_of_intra_block; i++) { rate += aec_write_intra_pred_mode_fastrdo(p_aec, p_cu->cu_info.pred_intra_modes[i]); } if (h->param->chroma_format != CHROMA_400) { int i_left_cmode = DM_PRED_C; /* check left */ if (p_cu->p_left_cu != NULL) { i_left_cmode = p_cu->p_left_cu->i_intra_mode_c; } rate += aec_write_intra_pred_cmode_fastrdo(p_aec, &p_cu->cu_info, i_left_cmode); } } return rate; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int write_mvd_fastrdo(aec_t *p_aec, cu_t *p_cu, int k, int bwd_flag) { int curr_mvd_x = p_cu->cu_info.mvd[bwd_flag][k].x; int curr_mvd_y = p_cu->cu_info.mvd[bwd_flag][k].y; int rate; rate = aec_write_mvd_fastrdo(p_aec, curr_mvd_x, 0); rate += aec_write_mvd_fastrdo(p_aec, curr_mvd_y, 1); return rate; } /* --------------------------------------------------------------------------- */ static int write_cu_refs_mvds_fastrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int mode = p_cu->cu_info.i_mode; int rate = 0; int k, refframe; int pdir; int dmh_mode; /* When CU is intra or skip mode, no need to code ref_idx and mvd */ if (IS_INTRA_MODE(mode) || IS_SKIP_MODE(mode)) { return 0; } /* only one frame on each direction, no need to code ref_idx */ // forward reference if (h->i_type != SLICE_TYPE_B && h->i_ref > 1) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { if (p_cu->cu_info.b8pdir[k] == PDIR_FWD || p_cu->cu_info.b8pdir[k] == PDIR_DUAL) { refframe = p_cu->cu_info.ref_idx_1st[k]; rate += aec_write_ref_fastrdo(h, p_aec, refframe); } } } /* write backward reference indexes of this CU, no need for current AVS2 */ /* write DMH mode, "dir_multi_hypothesis_mode" */ if (h->i_type == SLICE_TYPE_F /*&& h->param->enable_dmh*/ && p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD && p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) { if (!(p_cu->cu_info.i_level == B8X8_IN_BIT && p_cu->cu_info.i_mode >= PRED_2NxN && p_cu->cu_info.i_mode <= PRED_nRx2N)) { dmh_mode = p_cu->cu_info.dmh_mode; rate += aec_write_dmh_mode_fastrdo(p_aec, p_cu->cu_info.i_level, dmh_mode); } } /* write forward MVD */ for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir != PDIR_BWD) { rate += write_mvd_fastrdo(p_aec, p_cu, k, 0); } } /* write backward MVD */ if (h->i_type == SLICE_TYPE_B) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir == PDIR_BWD || pdir == PDIR_BID) { //has backward vector rate += write_mvd_fastrdo(p_aec, p_cu, k, 1); } } } return rate; } #if ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ int write_cu_cbp_dqp_fastrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int *last_dqp) { int rate = aec_write_cu_cbp_fastrdo(p_aec, p_cu_info, slice_index_cur_cu, h); if (!p_cu_info->i_cbp) { *last_dqp = 0; } if (p_cu_info->i_cbp != 0 && h->param->i_rc_method == XAVS2_RC_CBR_SCU) { rate += aec_write_dqp_fastrdo(p_aec, cu_get_qp(h, p_cu_info), *last_dqp); #if ENABLE_RATE_CONTROL_CU *last_dqp = p_cu_info->i_delta_qp; #else *last_dqp = 0; #endif } return rate; } #endif /* --------------------------------------------------------------------------- */ static int write_luma_block_coeff_fastrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t 
*quant_coeff, runlevel_t *runlevel, int i_level, int i_stride_shift, int is_intra, int intra_mode, int max_bits) { const int16_t(*cg_scan)[2] = NULL; int b_ver = p_cu->cu_info.i_tu_split == TU_SPLIT_VER; int b_hor = p_cu->cu_info.i_tu_split == TU_SPLIT_HOR; int intra_pred_class = INTRA_PRED_DC_DIAG; int num_cg; if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (b_hor) { cg_scan = tab_cg_scan_list_hor[i_level - 2]; } else if (b_ver) { cg_scan = tab_cg_scan_list_ver[i_level - 2]; } else { cg_scan = tab_cg_scan_list_nxn[i_level - 2]; } // reset b_hor and b_ver b_hor = (is_intra && tab_intra_mode_scan_type[intra_mode] == INTRA_PRED_HOR && p_cu->cu_info.i_mode != PRED_I_2Nxn && p_cu->cu_info.i_mode != PRED_I_nx2N); b_ver = !b_hor; num_cg = 1 << (i_level + i_level - 4); // number of CGs if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = cg_scan; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_stride_shift; runlevel->b_hor = b_hor; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; /* return bit rate */ if (IS_INTRA_MODE(p_cu->cu_info.i_mode)) { assert(intra_mode < NUM_INTRA_MODE); intra_pred_class = tab_intra_mode_scan_type[intra_mode]; } return aec_write_run_level_luma_fastrdo(p_aec, intra_pred_class, runlevel, h, max_bits); } /* --------------------------------------------------------------------------- */ static int write_chroma_block_coeff_fastrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int max_bits) { int num_cg = 1 << (i_level + i_level - 4); if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = tab_cg_scan_list_nxn[i_level - 2]; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_level; runlevel->b_hor = 0; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; return aec_write_run_level_chroma_fastrdo(p_aec, runlevel, h, max_bits); } /** * =========================================================================== * function handler * =========================================================================== */ binary_t gf_aec_fastrdo = { /* syntax elements */ .write_intra_pred_mode = aec_write_intra_pred_mode_fastrdo, .write_ctu_split_flag = aec_write_split_flag_fastrdo, .est_cu_header = write_cu_header_fastrdo, .est_cu_refs_mvds = write_cu_refs_mvds_fastrdo, .est_luma_block_coeff = write_luma_block_coeff_fastrdo, .est_chroma_block_coeff = write_chroma_block_coeff_fastrdo, #if ENABLE_RATE_CONTROL_CU .write_cu_cbp_dqp = write_cu_cbp_dqp_fastrdo, #else .write_cu_cbp = aec_write_cu_cbp_fastrdo, #endif .write_sao_mergeflag = write_sao_mergeflag_fastrdo, .write_sao_mode = write_sao_mode_fastrdo, .write_sao_offset = write_sao_offset_fastrdo, .write_sao_type = write_sao_type_fastrdo, .write_alf_lcu_ctrl = aec_write_alf_lcu_ctrl_fastrdo, }; xavs2-1.3/source/encoder/aec_rdo.c000066400000000000000000002037441340660520300171100ustar00rootroot00000000000000/* * aec_rdo.c * * Description of this file: * AEC functions definition of RDO module of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "aec.h" #include "bitstream.h" #include "block_info.h" #include "cudata.h" /** * =========================================================================== * binary * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_shift(uint32_t v) { #if SYS_WINDOWS && !ARCH_X86_64 __asm { bsr eax, v mov v, eax } return 8 - v; #else int i; for (i = 0; !(v & 0x100); i++) { v <<= 1; } return i; #endif } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_rdo(aec_t *p_aec, uint8_t symbol, context_t *p_ctx) { #if !CTRL_OPT_AEC static const uint16_t thres_mps_update = (256 << LG_PMPS_SHIFTNO); static const uint16_t sum_mps_lps = (512 << LG_PMPS_SHIFTNO) - 1; #endif uint32_t lg_pmps = p_ctx->LG_PMPS; const uint32_t lg_pmps_shifted = lg_pmps >> LG_PMPS_SHIFTNO; const uint32_t t1 = p_aec->i_t1; const int s = (t1 < lg_pmps_shifted); if (symbol != p_ctx->MPS) { // LPS const uint32_t t = ((-s) & t1) + lg_pmps_shifted; const int shift = aec_get_shift(t); p_aec->i_bits_to_follow += s + shift; p_aec->i_t1 = (t << shift) & 0xff; #if CTRL_OPT_AEC p_ctx->v = g_tab_ctx_lps[p_ctx->v].v; #else lg_pmps += tab_cwr[p_ctx->cycno]; if (lg_pmps >= thres_mps_update) { lg_pmps = sum_mps_lps - lg_pmps; p_ctx->MPS = (uint8_t)(!p_ctx->MPS); } p_ctx->LG_PMPS = (uint16_t)lg_pmps; p_ctx->cycno += (p_ctx->cycno < 3); #endif } else { // MPS happens p_aec->i_bits_to_follow += s; p_aec->i_t1 = (t1 - lg_pmps_shifted) & 0xff; #if CTRL_OPT_AEC p_ctx->v = g_tab_ctx_mps[p_ctx->v].v; #else lg_pmps >>= XAVS2_MAX(p_ctx->cycno + 2, 3); p_ctx->LG_PMPS -= (uint16_t)(lg_pmps + (lg_pmps >> LG_PMPS_SHIFTNO)); p_ctx->cycno += (uint8_t)(!p_ctx->cycno); #endif } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_tu_rdo(aec_t *p_aec, int num_zeros, int max_len, context_t *p_ctx) { max_len -= num_zeros; while (num_zeros != 0) { biari_encode_symbol_rdo(p_aec, 0, p_ctx); num_zeros--; } if (max_len) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); } } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_eq_prob_rdo(aec_t *p_aec, uint8_t symbol) { UNUSED_PARAMETER(symbol); p_aec->i_bits_to_follow++; } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbols_eq_prob_rdo(aec_t 
*p_aec, uint32_t val, int len) { UNUSED_PARAMETER(val); p_aec->i_bits_to_follow += len; } /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_final_rdo(aec_t *p_aec, uint8_t symbol) { const uint32_t t1 = p_aec->i_t1; if (symbol) { p_aec->i_bits_to_follow += (!t1) + 8; p_aec->i_t1 = 0; } else { // MPS p_aec->i_bits_to_follow += (!t1); p_aec->i_t1 = (t1 - 1) & 0xff; } } /** * =========================================================================== * syntax coding * =========================================================================== */ /* --------------------------------------------------------------------------- * cu type for B/F/P frame */ static INLINE int aec_write_cutype_rdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_cu_cbp, int is_amp_enabled) { context_t *p_ctx = p_aec->p_ctx_set->cu_type_contexts; int org_bits = arienco_bits_written(p_aec); int act_sym = MAP_CU_TYPE[i_cu_type]; if (i_cu_type == PRED_SKIP && i_cu_cbp == 0) { act_sym = 0; } switch (act_sym) { case 0: // SKIP biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); break; case 1: // DIRECT biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; case 2: // 2Nx2N biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); break; case 3: // 2NxN, 2NxnU, 2NxnD biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 3); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_2NxN) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_rdo(p_aec, (uint8_t)(i_cu_type == PRED_2NxnU), p_ctx + 1); // AMP shape } } break; case 4: // Nx2N, nLx2N, nRx2N biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 4); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { p_ctx = p_aec->p_ctx_set->shape_of_partition_index; if (i_cu_type == PRED_Nx2N) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_rdo(p_aec, (uint8_t)(i_cu_type == PRED_nLx2N), p_ctx + 1); // AMP shape } } break; //case 5: // NxN, not enabled // biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); // biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); // biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); // biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); // biari_encode_symbol_rdo(p_aec, 0, p_ctx + 4); // if (i_cu_level > B8X8_IN_BIT) { // biari_encode_symbol_final_rdo(p_aec, 1); // } // break; default: // case 6: // Intra biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 4); if (i_cu_level > B8X8_IN_BIT) { biari_encode_symbol_final_rdo(p_aec, 0); } break; } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode a 
pair of intra prediction modes of a given cu */ static int aec_write_intra_pred_mode_rdo(aec_t *p_aec, int ipmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_luma_pred_mode; int org_bits = arienco_bits_written(p_aec); if (ipmode >= 0) { biari_encode_symbol_rdo(p_aec, 0, p_ctx ); biari_encode_symbol_rdo(p_aec, (uint8_t)((ipmode & 0x10) >> 4), p_ctx + 1); biari_encode_symbol_rdo(p_aec, (uint8_t)((ipmode & 0x08) >> 3), p_ctx + 2); biari_encode_symbol_rdo(p_aec, (uint8_t)((ipmode & 0x04) >> 2), p_ctx + 3); biari_encode_symbol_rdo(p_aec, (uint8_t)((ipmode & 0x02) >> 1), p_ctx + 4); biari_encode_symbol_rdo(p_aec, (uint8_t)((ipmode & 0x01) ), p_ctx + 5); } else { biari_encode_symbol_rdo(p_aec, 1, p_ctx ); biari_encode_symbol_rdo(p_aec, (uint8_t)(ipmode + 2), p_ctx + 6); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the reference parameter of a given cu */ static INLINE int aec_write_ref_rdo(xavs2_t *h, aec_t *p_aec, int ref_idx) { context_t *p_ctx = p_aec->p_ctx_set->pu_reference_index; int org_bits = arienco_bits_written(p_aec); int act_sym = ref_idx; /* 0λ0ģ1λ1ģ2 */ if (act_sym == 0) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); } else { int act_ctx = 0; biari_encode_symbol_rdo(p_aec, 0, p_ctx++); while (--act_sym != 0) { biari_encode_symbol_rdo(p_aec, 0, p_ctx); if (!act_ctx) { p_ctx++; } } if (ref_idx < h->i_ref - 1) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the motion vector data */ static INLINE int aec_write_mvd_rdo(aec_t *p_aec, int mvd, int xy) { context_t *p_ctx = p_aec->p_ctx_set->mvd_contexts[xy]; int org_bits = arienco_bits_written(p_aec); uint32_t act_sym = XAVS2_ABS(mvd); if (act_sym < 3) { // 0, 1, 2 if (act_sym == 0) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); } else if (act_sym == 1) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); } else { // act_sym == 2 biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); } } else { int exp_golomb_order = 0; biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); if ((act_sym & 1) == 1) { // odds >3 biari_encode_symbol_eq_prob_rdo(p_aec, 0); act_sym = (act_sym - 3) >> 1; } else { // even >3 biari_encode_symbol_eq_prob_rdo(p_aec, 1); act_sym = (act_sym - 4) >> 1; } /* exp_golomb part */ while (act_sym >= (uint32_t)(1 << exp_golomb_order)) { act_sym -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_rdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_rdo(p_aec, act_sym, exp_golomb_order); // Exp-Golomb: suffix } if (mvd != 0) { // mv sign biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)(mvd < 0)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dmh_mode_rdo(aec_t *p_aec, int i_cu_level, int dmh_mode) { static const int iEncMapTab[9] = { 0, 5, 6, 1, 2, 7, 8, 3, 4 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index + 3; int org_bits = arienco_bits_written(p_aec); int symbol 
= dmh_mode != 0; p_ctx += (i_cu_level - MIN_CU_SIZE_IN_BIT) * 3; biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx); if (symbol) { int iMapVal = iEncMapTab[dmh_mode]; if (iMapVal < 3) { symbol = (iMapVal != 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)symbol); } else if (iMapVal < 5) { symbol = (iMapVal != 3); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)symbol); } else { biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); symbol = (iMapVal >= 7); biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)symbol); symbol = !(iMapVal & 1); biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)symbol); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * write "transform_split_flag" and SDIP type for intra CU */ static INLINE int aec_write_intra_cutype_rdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_tu_split, int is_sdip_enabled) { context_t *p_ctx = p_aec->p_ctx_set->transform_split_flag; int org_bits = arienco_bits_written(p_aec); uint8_t transform_split_flag = i_tu_split != TU_SPLIT_NON; /* just write split or not */ if (i_cu_level == B8X8_IN_BIT) { biari_encode_symbol_rdo(p_aec, transform_split_flag, p_ctx + 1); } else if (is_sdip_enabled && (i_cu_level == B32X32_IN_BIT || i_cu_level == B16X16_IN_BIT)) { biari_encode_symbol_rdo(p_aec, transform_split_flag, p_ctx + 2); if (transform_split_flag) { p_ctx = p_aec->p_ctx_set->intra_pu_type_contexts; biari_encode_symbol_rdo(p_aec, i_cu_type == PRED_I_2Nxn, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_rdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int pdir0, int pdir1) { int new_pdir[4] = { 2, 1, 3, 0 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); int act_ctx = 0; int act_sym; int symbol; if (i_cu_type == PRED_2Nx2N) { /* һCUֻһPUPUĸʹ3: 0, 1, 2 */ act_sym = pdir0; while (act_sym >= 1) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } } else if ((i_cu_type >= PRED_2NxN && i_cu_type <= PRED_nRx2N) && i_cu_level == B8X8_IN_BIT) { /* һCUΪPUCUСΪ8x8ʱԤΪ4x88x4ÿPUֻǵԤ⣬ * ܼ4ϣҪλBit b_pu_type_min_index ʹ */ p_ctx = p_aec->p_ctx_set->b_pu_type_min_index; pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = (pdir0 != 1); biari_encode_symbol_rdo(p_aec, (int8_t)act_sym, p_ctx + 0); act_sym = (pdir0 == pdir1); biari_encode_symbol_rdo(p_aec, (int8_t)act_sym, p_ctx + 1); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { //1010 /* act_ctx: 3,...,14 */ pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = pdir0; act_ctx = 3; /* 3,4,5 */ while (act_sym >= 1) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } symbol = (pdir0 == pdir1); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 6); /* 7,...,14 */ if (!symbol) { switch (pdir0) { case 0: symbol = (pdir1 == 1); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 7); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 8); } 
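/* cases 1..3 below repeat the same one-or-two-bin pattern as case 0, using context offsets 9..14 to distinguish the remaining pdir1 values */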
break; case 1: symbol = (pdir1 == 0); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 9); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 10); } break; case 2: symbol = (pdir1 == 0); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 11); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 12); } break; case 3: symbol = (pdir1 == 0); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 13); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_rdo(p_aec, (uint8_t)symbol, p_ctx + 14); } break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_dhp_rdo(aec_t *p_aec, int i_cu_type, int pdir0, int pdir1) { context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); pdir0 = (pdir0 != 0); pdir1 = (pdir1 != 0); if (i_cu_type == PRED_2Nx2N) { biari_encode_symbol_rdo(p_aec, (uint8_t)pdir0, p_ctx); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { // 1010 biari_encode_symbol_rdo(p_aec, (uint8_t)pdir0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, (uint8_t)(pdir0 == pdir1), p_ctx + 2); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_wpm_rdo(aec_t *p_aec, int ref_idx, int num_ref) { context_t *p_ctx = p_aec->p_ctx_set->weighted_skip_mode; int org_bits = arienco_bits_written(p_aec); int i, idx_bin = 0; for (i = 0; i < ref_idx; i++) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + idx_bin); idx_bin = XAVS2_MIN(idx_bin + 1, 2); } if (ref_idx < num_ref - 1) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + idx_bin); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_spatial_skip_mode_rdo(aec_t *p_aec, int mode) { context_t *p_ctx = p_aec->p_ctx_set->cu_subtype_index; int org_bits = arienco_bits_written(p_aec); int offset; for (offset = 0; offset < mode; offset++) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + offset); } if (mode < DS_MAX_NUM) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + offset); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the chroma intra prediction mode of an 8x8 block */ static INLINE int aec_write_intra_pred_cmode_rdo(aec_t *p_aec, cu_info_t *p_cu_info, int i_left_cmode) { context_t *p_ctx = p_aec->p_ctx_set->intra_chroma_pred_mode; int i_chroma_mode = p_cu_info->i_intra_mode_c; int org_bits = arienco_bits_written(p_aec); int act_ctx = i_left_cmode != DM_PRED_C; // ? 
1 : 0; if (i_chroma_mode == DM_PRED_C) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } else { int lmode = tab_intra_mode_luma2chroma[p_cu_info->real_intra_modes[0]]; int is_redundant = lmode >= 0; biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); i_chroma_mode -= (1 + (is_redundant && i_chroma_mode > lmode)); p_ctx += 2; switch (i_chroma_mode) { case 0: biari_encode_symbol_rdo(p_aec, 1, p_ctx); break; case 1: biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_rdo(p_aec, 1, p_ctx); break; case 2: biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_rdo(p_aec, 1, p_ctx); break; case 3: biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_rdo(p_aec, 0, p_ctx); break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "invalid chroma mode %d\n", i_chroma_mode); break; } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of an luma CB */ static int write_cbp_bit_rdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int b8, int bit) { int org_bits = arienco_bits_written(p_aec); int i_cu_level = p_cu_info->i_level; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; int is_hor_part = p_cu_info->i_tu_split == TU_SPLIT_HOR; int is_ver_part = p_cu_info->i_tu_split == TU_SPLIT_VER; int a, b; int x_4x4, y_4x4; ///< ǰ任4x4λ int w_4x4, h_4x4; ///< ǰ任4x4С context_t *p_ctx; /* get context pointer */ if (b8 == 4) { p_ctx = p_aec->p_ctx_set->cbp_contexts + 8; } else { w_4x4 = h_4x4 = 1 << (i_cu_level - MIN_PU_SIZE_IN_BIT); x_4x4 = p_cu_info->i_scu_x << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); y_4x4 = p_cu_info->i_scu_y << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); if (b8 != 4 && transform_split_flag) { if (is_hor_part) { h_4x4 >>= 2; y_4x4 += h_4x4 * b8; } else if (is_ver_part) { w_4x4 >>= 2; x_4x4 += w_4x4 * b8; } else { w_4x4 >>= 1; h_4x4 >>= 1; x_4x4 += (b8 & 1) ? w_4x4 : 0; y_4x4 += (b8 >> 1) ? 
h_4x4 : 0; } } a = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 - 1, y_4x4 ); b = get_neighbor_cbp_y(h, p_cu_info, slice_index_cur_cu, x_4x4 , y_4x4 - 1); p_ctx = p_aec->p_ctx_set->cbp_contexts + a + 2 * b; } /* write bits */ biari_encode_symbol_rdo(p_aec, (uint8_t)bit, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of a cu */ static INLINE int aec_write_cu_cbp_rdo(aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, xavs2_t *h) { context_t *p_ctx = p_aec->p_ctx_set->cbp_contexts + 4; int org_bits = arienco_bits_written(p_aec); int i_cu_cbp = p_cu_info->i_cbp; int i_cu_type = p_cu_info->i_mode; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; if (IS_INTER_MODE(i_cu_type)) { /* write cbp for inter pred mode --------------------------- */ if (!IS_SKIP_MODE(i_cu_type)) { write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 4, i_cu_cbp == 0); } if (i_cu_cbp) { // write tr_size biari_encode_symbol_rdo(p_aec, (uint8_t)transform_split_flag, p_aec->p_ctx_set->transform_split_flag); // write cbp for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_rdo(p_aec, 0, p_ctx); break; case 1: biari_encode_symbol_rdo(p_aec, 1, p_ctx); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); break; case 2: biari_encode_symbol_rdo(p_aec, 1, p_ctx); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); break; case 3: biari_encode_symbol_rdo(p_aec, 1, p_ctx); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); break; } } // write cbp for luma if (transform_split_flag == 0) { if (i_cu_cbp > 15) { write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); } } else { write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } } } else { /* write cbp for intra pred mode --------------------------- */ // write bits for luma if (transform_split_flag == 0 || i_cu_type == PRED_I_2Nx2N) { write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 0x0F) != 0); } else { write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_rdo(h, p_aec, p_cu_info, slice_index_cur_cu, 3, (i_cu_cbp & 8) != 0); } // write bits for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); break; case 1: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); break; case 2: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 3); break; case 3: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 3); break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #if 
ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dqp_rdo(aec_t *p_aec, int delta_qp, int last_dqp) { context_t *p_ctx = p_aec->p_ctx_set->delta_qp_contexts; int org_bits = arienco_bits_written(p_aec); int act_ctx = (last_dqp) ? 1 : 0; int act_sym = (delta_qp > 0) ? (2 * delta_qp - 1) : (-2 * delta_qp); if (act_sym == 0) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); act_ctx = 2; if (act_sym == 1) { biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); act_ctx++; while (act_sym > 2) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + act_ctx); act_sym--; } biari_encode_symbol_rdo(p_aec, 1, p_ctx + act_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #endif /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_cg_pos(aec_t *p_aec, int b_luma, int intra_pred_class, int i_cg, int cg_last_x, int cg_last_y, int num_cg, int num_cg_x_minus1, int num_cg_y_minus1) { context_t *p_ctx = p_aec->p_ctx_set->last_cg_contexts + (b_luma ? 0 : NUM_LAST_CG_CTX_LUMA); int count; if (num_cg == 4) { // 8x8 switch (i_cg) { case 0: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 2); break; default: // case 3: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 2); break; } } else { if (b_luma && intra_pred_class == INTRA_PRED_DC_DIAG) { XAVS2_SWAP(cg_last_x, cg_last_y); XAVS2_SWAP(num_cg_x_minus1, num_cg_y_minus1); } if (cg_last_x == 0 && cg_last_y == 0) { biari_encode_symbol_rdo(p_aec, 0, p_ctx + 3); /* last_cg0_flag */ } else { biari_encode_symbol_rdo(p_aec, 1, p_ctx + 3); /* last_cg0_flag */ /* last_cg_x */ biari_encode_tu_rdo(p_aec, cg_last_x, num_cg_x_minus1, p_ctx + 4); /* last_cg_y or last_cg_y_minus1 */ count = (cg_last_x == 0); // cg_last_xΪ㣬cg_last_yдһ㣨һ㣩 biari_encode_tu_rdo(p_aec, cg_last_y - count, num_cg_y_minus1 - count, p_ctx + 5); } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_coeff_pos(aec_t *p_aec, int isLastCG, int b_one_cg, int cg_x, int cg_y, int last_coeff_pos_x, int last_coeff_pos_y, int b_luma, int intra_pred_class) { context_t *p_ctx = p_aec->p_ctx_set->last_pos_contexts + (b_luma ? 0 : NUM_LAST_POS_CTX_LUMA); int offset; if (!isLastCG) { last_coeff_pos_x = 3 - last_coeff_pos_x; if (intra_pred_class == INTRA_PRED_DC_DIAG) { last_coeff_pos_y = 3 - last_coeff_pos_y; } } if (cg_x == 0 && cg_y > 0 && intra_pred_class == INTRA_PRED_DC_DIAG) { XAVS2_SWAP(last_coeff_pos_x, last_coeff_pos_y); } /* AVS2-P2: 8.3.3.2.14 ȷlast_coeff_pos_x last_coeff_pos_y ctxIdxInc */ if (b_luma == 0) { // ɫȷռ12 offset = b_one_cg ? 
0 : 4 + isLastCG * 4; } else if (b_one_cg) { // Log2TransformSize Ϊ 2ռ8 offset = 40 + (intra_pred_class == INTRA_PRED_DC_DIAG) * 4; } else if (cg_x != 0 && cg_y != 0) { // cg_x cg_y Ϊ㣬ռ8 offset = 32 + isLastCG * 4; } else { // λռ40 offset = (4 * isLastCG + 2 * (cg_x == 0 && cg_y == 0) + (intra_pred_class == INTRA_PRED_DC_DIAG)) * 4; } p_ctx += offset; switch (last_coeff_pos_x) { case 0: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); break; } p_ctx += 2; switch (last_coeff_pos_y) { case 0: biari_encode_symbol_rdo(p_aec, 1, p_ctx + 0); break; case 1: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; case 2: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 1, p_ctx + 1); break; default: // case 3: biari_encode_symbol_rdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_rdo(p_aec, 0, p_ctx + 1); break; } } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_luma_rdo(aec_t *p_aec, int intra_pred_class, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[0]; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, runlevel->b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (i_cg != 0); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_rdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { // for TB > 4x4, need to write int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 1, intra_pred_class, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; } /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 1, intra_pred_class); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_rdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_rdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_rdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_rdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1); biari_encode_tu_rdo(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int py = (tab_scan_4x4[14 - ctxpos][1] + 1) >> 1; // 0, 1, 2 int moddiv = (intra_pred_class != INTRA_PRED_DC_DIAG) ? py : (ctxpos > 11 ? 0 : (ctxpos > 4 ? 1 : 2)); // 012 offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (4 + moddiv)) + (num_cg == 1 ? 0 : 4); // 0,...,10 } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_rdo(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_rdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_rdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level luma, POC[%d]: p_cu: (%d, %d), level %d, cu_type %d\n", h->fdec->i_poc, runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_chroma_rdo(aec_t *p_aec, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; context_t(*Primary)[NUM_MAP_CTX] = p_aec->p_ctx_set->coeff_run[1]; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int level_max = 0; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { context_t *p_ctx; int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = 0; // runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { p_ctx = p_aec->p_ctx_set->nonzero_cg_flag + (NUM_SIGN_CG_CTX_LUMA); if (i) { // i > 0 cg_flag Ϊ1зϵ biari_encode_symbol_rdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_rdo(p_aec, 0, p_ctx); continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 0, INTRA_PRED_DC_DIAG, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; // δҵһϵCG } /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { int scan_pos = tab_1d_scan_4x4[15 - pos]; int x_in_cg = scan_pos & 3; int y_in_cg = scan_pos >> 2; aec_write_last_coeff_pos(p_aec, rank == 0, num_cg == 1, CGx, CGy, x_in_cg, y_in_cg, 0, INTRA_PRED_DC_DIAG); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { int absSum5 = 0; int k, n = 0; int ctxpos, offset = 0; int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_rdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_rdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_rdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); biari_encode_symbol_final_rdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ p_ctx = p_aec->p_ctx_set->coeff_level; p_ctx += 10 * (i_cg == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1) + 20; biari_encode_tu_rdo(p_aec, symbol, 31, p_ctx); } level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; // update rank /* 3.3, run, "coeff_run[i]" */ for (k = pairs; k < pairs + pairsInCG; k++) { n += p_runlevel[k + 1].run + 1; if (n >= 7) { break; } absSum5 += XAVS2_ABS(p_runlevel[k + 1].level); } absSum5 = (absSum5 + absLevel) >> 1; p_ctx = Primary[XAVS2_MIN(absSum5, 2)]; ctxpos = pos; symbol = Run; for (;;) { if (ctxpos < NUM_OF_COEFFS_IN_CG - 1) { int moddiv = (ctxpos <= 9); offset = ((i_cg == 0) ? (ctxpos == 14 ? 0 : (1 + moddiv)) : (3 + moddiv)) + (num_cg == 1 ? 0 : 3); } if (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_rdo(p_aec, 0, p_ctx + offset); ctxpos++; } else { break; } } pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { assert(offset >= 0 && offset < NUM_MAP_CTX); biari_encode_symbol_rdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_rdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level chroma, p_cu: (%d, %d), level %d, cu_type %d\n", runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ int aec_write_split_flag_rdo(aec_t *p_aec, int i_cu_split, int i_cu_level) { context_t *p_ctx = p_aec->p_ctx_set->split_flag + (MAX_CU_SIZE_IN_BIT - i_cu_level); int org_bits = arienco_bits_written(p_aec); biari_encode_symbol_rdo(p_aec, (uint8_t)i_cu_split, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mergeflag_rdo(aec_t *p_aec, int avail_left, int avail_up, SAOBlkParam *p_sao_param) { int b_merge_left = 0; int b_merge_up; int val = 0; context_t *p_ctx = p_aec->p_ctx_set->sao_merge_type_index; int org_bits = arienco_bits_written(p_aec); int ctx_offset = avail_left + avail_up; if (avail_left) { b_merge_left = (p_sao_param->mergeIdx == SAO_MERGE_LEFT); val = b_merge_left ? 1 : 0; } if (avail_up && !b_merge_left) { b_merge_up = (p_sao_param->mergeIdx == SAO_MERGE_ABOVE); val = b_merge_up ? (1 + avail_left) : 0; } if (ctx_offset == 1) { assert(val <= 1); biari_encode_symbol_rdo(p_aec, (uint8_t)val, p_ctx + 0); } else if (ctx_offset == 2) { assert(val <= 2); biari_encode_symbol_rdo(p_aec, val & 0x01, p_ctx + 1); if (val != 1) { biari_encode_symbol_rdo(p_aec, (val >> 1) & 0x01, p_ctx + 2); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mode_rdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { context_t *p_ctx = p_aec->p_ctx_set->sao_mode; int org_bits = arienco_bits_written(p_aec); int sao_type = saoBlkParam->typeIdc; if (sao_type == SAO_TYPE_OFF) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); } else if (sao_type == SAO_TYPE_BO) { biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_rdo(p_aec, 1); } else { // SAO_TYPE_EO (0~3) biari_encode_symbol_rdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_rdo(p_aec, 0); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_sao_offset_rdo(aec_t *p_aec, int val, int offset_type) { /* --------------------------------------------------------------------------- */ static const int EO_OFFSET_MAP[8] = { 3, 1, 0, 2, 4, 5, 6, 7 }; context_t *p_ctx = p_aec->p_ctx_set->sao_interval_offset_abs; int org_bits = arienco_bits_written(p_aec); int act_sym; assert(offset_type != SAO_CLASS_EO_PLAIN); if (offset_type == SAO_CLASS_EO_FULL_VALLEY) { act_sym = EO_OFFSET_MAP[val + 1]; } else if (offset_type == SAO_CLASS_EO_FULL_PEAK) { act_sym = EO_OFFSET_MAP[-val + 1]; } else { act_sym = XAVS2_ABS(val); } if (act_sym == 0) { if (offset_type == SAO_CLASS_BO) { biari_encode_symbol_rdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_eq_prob_rdo(p_aec, 1); } } else { int maxvalue = tab_saoclip[offset_type][2]; int temp = act_sym; while (temp != 0) { if (offset_type == 
SAO_CLASS_BO && temp == act_sym) { biari_encode_symbol_rdo(p_aec, 0, p_ctx); } else { biari_encode_symbol_eq_prob_rdo(p_aec, 0); } temp--; } if (act_sym < maxvalue) { biari_encode_symbol_eq_prob_rdo(p_aec, 1); } } if (offset_type == SAO_CLASS_BO && act_sym) { // write sign symbol biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)(val >= 0 ? 0 : 1)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_offset_rdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int bandIdxBO[4]; bandIdxBO[0] = saoBlkParam->startBand; bandIdxBO[1] = bandIdxBO[0] + 1; bandIdxBO[2] = (saoBlkParam->startBand + saoBlkParam->deltaBand) & 31; bandIdxBO[3] = bandIdxBO[2] + 1; rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[bandIdxBO[0]], SAO_CLASS_BO); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[bandIdxBO[1]], SAO_CLASS_BO); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[bandIdxBO[2]], SAO_CLASS_BO); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[bandIdxBO[3]], SAO_CLASS_BO); } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_VALLEY], SAO_CLASS_EO_FULL_VALLEY); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_VALLEY], SAO_CLASS_EO_HALF_VALLEY); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_PEAK], SAO_CLASS_EO_HALF_PEAK); rate += aec_write_sao_offset_rdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_PEAK], SAO_CLASS_EO_FULL_PEAK); } return rate; } /* --------------------------------------------------------------------------- */ int write_sao_type_rdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; int val; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int exp_golomb_order = 1; /* start band */ val = saoBlkParam->startBand; biari_encode_symbol_eq_prob_rdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_rdo(p_aec, (val >> 1) & 0x01); biari_encode_symbol_eq_prob_rdo(p_aec, (val >> 2) & 0x01); biari_encode_symbol_eq_prob_rdo(p_aec, (val >> 3) & 0x01); biari_encode_symbol_eq_prob_rdo(p_aec, (val >> 4) & 0x01); /* delta band */ assert(saoBlkParam->deltaBand >= 2); val = saoBlkParam->deltaBand - 2; while (val >= (1 << exp_golomb_order)) { biari_encode_symbol_eq_prob_rdo(p_aec, 0); val -= (1 << exp_golomb_order); exp_golomb_order++; } if (exp_golomb_order == 4) { exp_golomb_order = 0; } else { biari_encode_symbol_eq_prob_rdo(p_aec, 1); } while (exp_golomb_order--) { // next binary part biari_encode_symbol_eq_prob_rdo(p_aec, (uint8_t)((val >> exp_golomb_order) & 1)); } } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); val = saoBlkParam->typeIdc; biari_encode_symbol_eq_prob_rdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_rdo(p_aec, (val >> 1) & 0x01); } return rate; } /* --------------------------------------------------------------------------- */ int aec_write_alf_lcu_ctrl_rdo(aec_t *p_aec, uint8_t iflag) { int org_bits = arienco_bits_written(p_aec); context_t *p_ctx = &(p_aec->p_ctx_set->alf_cu_enable_scmodel[0][0]); biari_encode_symbol_rdo(p_aec, iflag, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* 
--------------------------------------------------------------------------- * codes cu header */ static int write_cu_header_rdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int rate = 0; int level = p_cu->cu_info.i_level; int mode = p_cu->cu_info.i_mode; int i; // write bits for inter cu type if (h->i_type != SLICE_TYPE_I) { rate += aec_write_cutype_rdo(p_aec, mode, level, p_cu->cu_info.i_cbp, h->param->enable_amp); if (h->i_type == SLICE_TYPE_B && (mode >= PRED_2Nx2N && mode <= PRED_nRx2N)) { rate += aec_write_pdir_rdo(p_aec, mode, level, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } else if (h->i_type == SLICE_TYPE_F && h->param->enable_dhp && (h->i_ref > 1) && ((mode >= PRED_2Nx2N && mode <= PRED_nRx2N && level > B8X8_IN_BIT) || (mode == PRED_2Nx2N && level == B8X8_IN_BIT))) { rate += aec_write_pdir_dhp_rdo(p_aec, mode, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } /* write bits for F slice skip/direct mode */ if (IS_SKIP_MODE(mode)) { int b_write_spatial_skip = 0; if (h->i_type == SLICE_TYPE_F) { int weighted_skip_mode = p_cu->cu_info.directskip_wsm_idx; /* write weighted skip mode */ if (h->param->enable_wsm && h->i_ref > 1) { rate += aec_write_wpm_rdo(p_aec, weighted_skip_mode, h->i_ref); } /* write bits for F-spatial-skip mode */ b_write_spatial_skip = (h->param->enable_mhp_skip && (weighted_skip_mode == 0)); } b_write_spatial_skip = b_write_spatial_skip || (SLICE_TYPE_B == h->i_type); /* write bits for b-direct-skip mode */ if (b_write_spatial_skip) { rate += aec_write_spatial_skip_mode_rdo(p_aec, p_cu->cu_info.directskip_mhp_idx + 1); } } } // write bits for intra modes if (IS_INTRA_MODE(mode)) { int num_of_intra_block = mode != PRED_I_2Nx2N ? 4 : 1; /* write "transform_split_flag" and cu_type for SDIP */ rate += aec_write_intra_cutype_rdo(p_aec, mode, level, p_cu->cu_info.i_tu_split, h->param->enable_sdip); /* write intra pred mode */ for (i = 0; i < num_of_intra_block; i++) { rate += aec_write_intra_pred_mode_rdo(p_aec, p_cu->cu_info.pred_intra_modes[i]); } if (h->param->chroma_format != CHROMA_400) { int i_left_cmode = DM_PRED_C; /* check left */ if (p_cu->p_left_cu != NULL) { i_left_cmode = p_cu->p_left_cu->i_intra_mode_c; } rate += aec_write_intra_pred_cmode_rdo(p_aec, &p_cu->cu_info, i_left_cmode); } } return rate; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int write_mvd_rdo(aec_t *p_aec, cu_t *p_cu, int k, int bwd_flag) { int curr_mvd_x = p_cu->cu_info.mvd[bwd_flag][k].x; int curr_mvd_y = p_cu->cu_info.mvd[bwd_flag][k].y; int rate; rate = aec_write_mvd_rdo(p_aec, curr_mvd_x, 0); rate += aec_write_mvd_rdo(p_aec, curr_mvd_y, 1); return rate; } /* --------------------------------------------------------------------------- */ static int write_cu_refs_mvds_rdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int mode = p_cu->cu_info.i_mode; int rate = 0; int k, refframe; int pdir; int dmh_mode; /* When CU is intra or skip mode, no need to code ref_idx and mvd */ if (IS_INTRA_MODE(mode) || IS_SKIP_MODE(mode)) { return 0; } /* only one frame on each direction, no need to code ref_idx */ // forward reference if (h->i_type != SLICE_TYPE_B && h->i_ref > 1) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { if (p_cu->cu_info.b8pdir[k] == PDIR_FWD || p_cu->cu_info.b8pdir[k] == PDIR_DUAL) { refframe = p_cu->cu_info.ref_idx_1st[k]; rate += aec_write_ref_rdo(h, p_aec, refframe); } } } /* write backward reference indexes of this CU, no need for current AVS2 */ /* write DMH mode, "dir_multi_hypothesis_mode" */ if (h->i_type == 
SLICE_TYPE_F /*&& h->param->enable_dmh*/ && p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD && p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) { if (!(p_cu->cu_info.i_level == B8X8_IN_BIT && p_cu->cu_info.i_mode >= PRED_2NxN && p_cu->cu_info.i_mode <= PRED_nRx2N)) { dmh_mode = p_cu->cu_info.dmh_mode; rate += aec_write_dmh_mode_rdo(p_aec, p_cu->cu_info.i_level, dmh_mode); } } /* write forward MVD */ for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir != PDIR_BWD) { rate += write_mvd_rdo(p_aec, p_cu, k, 0); } } /* write backward MVD */ if (h->i_type == SLICE_TYPE_B) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir == PDIR_BWD || pdir == PDIR_BID) { //has backward vector rate += write_mvd_rdo(p_aec, p_cu, k, 1); } } } return rate; } #if ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ int write_cu_cbp_dqp_rdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int *last_dqp) { int rate = aec_write_cu_cbp_rdo(p_aec, p_cu_info, slice_index_cur_cu, h); if (!p_cu_info->i_cbp) { *last_dqp = 0; } if (p_cu_info->i_cbp != 0 && h->param->i_rc_method == XAVS2_RC_CBR_SCU) { rate += aec_write_dqp_rdo(p_aec, cu_get_qp(h, p_cu_info), *last_dqp); #if ENABLE_RATE_CONTROL_CU *last_dqp = p_cu_info->i_delta_qp; #else *last_dqp = 0; #endif } return rate; } #endif /* --------------------------------------------------------------------------- */ static int write_luma_block_coeff_rdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int i_stride_shift, int is_intra, int intra_mode, int max_bits) { const int16_t(*cg_scan)[2] = NULL; int b_ver = p_cu->cu_info.i_tu_split == TU_SPLIT_VER; int b_hor = p_cu->cu_info.i_tu_split == TU_SPLIT_HOR; int intra_pred_class = INTRA_PRED_DC_DIAG; int num_cg; if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (b_hor) { cg_scan = tab_cg_scan_list_hor[i_level - 2]; } else if (b_ver) { cg_scan = tab_cg_scan_list_ver[i_level - 2]; } else { cg_scan = tab_cg_scan_list_nxn[i_level - 2]; } // reset b_hor and b_ver b_hor = (is_intra && tab_intra_mode_scan_type[intra_mode] == INTRA_PRED_HOR && p_cu->cu_info.i_mode != PRED_I_2Nxn && p_cu->cu_info.i_mode != PRED_I_nx2N); b_ver = !b_hor; num_cg = 1 << (i_level + i_level - 4); // number of CGs if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = cg_scan; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_stride_shift; runlevel->b_hor = b_hor; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; /* return bit rate */ if (IS_INTRA_MODE(p_cu->cu_info.i_mode)) { assert(intra_mode < NUM_INTRA_MODE); intra_pred_class = tab_intra_mode_scan_type[intra_mode]; } return aec_write_run_level_luma_rdo(p_aec, intra_pred_class, runlevel, h, max_bits); } /* --------------------------------------------------------------------------- */ static int write_chroma_block_coeff_rdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int max_bits) { int num_cg = 1 << (i_level + i_level - 4); if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = tab_cg_scan_list_nxn[i_level - 2]; 
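/* chroma TBs are square and always use the NxN CG scan; i_level also serves as the
 * stride shift set just below. Worked example: an 8x8 chroma TB (i_level == 3) has
 * num_cg = 1 << (3 + 3 - 4) = 4 coefficient groups of 4x4 coefficients each. */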
runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_level; runlevel->b_hor = 0; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; return aec_write_run_level_chroma_rdo(p_aec, runlevel, h, max_bits); } /** * =========================================================================== * function handler * =========================================================================== */ binary_t gf_aec_rdo = { /* syntax elements */ .write_intra_pred_mode = aec_write_intra_pred_mode_rdo, .write_ctu_split_flag = aec_write_split_flag_rdo, .est_cu_header = write_cu_header_rdo, .est_cu_refs_mvds = write_cu_refs_mvds_rdo, .est_luma_block_coeff = write_luma_block_coeff_rdo, .est_chroma_block_coeff = write_chroma_block_coeff_rdo, #if ENABLE_RATE_CONTROL_CU .write_cu_cbp_dqp = write_cu_cbp_dqp_rdo, #else .write_cu_cbp = aec_write_cu_cbp_rdo, #endif .write_sao_mergeflag = write_sao_mergeflag_rdo, .write_sao_mode = write_sao_mode_rdo, .write_sao_offset = write_sao_offset_rdo, .write_sao_type = write_sao_type_rdo, .write_alf_lcu_ctrl = aec_write_alf_lcu_ctrl_rdo, }; xavs2-1.3/source/encoder/aec_vrdo.c000066400000000000000000001540411340660520300172710ustar00rootroot00000000000000/* * aec_vrdo.c * * Description of this file: * AEC functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "aec.h" #include "bitstream.h" #include "block_info.h" #include "cudata.h" /** * =========================================================================== * binary * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_shift(uint32_t v) { #if SYS_WINDOWS && !ARCH_X86_64 __asm { bsr eax, v mov v, eax } return 8 - v; #else int i; for (i = 0; !(v & 0x100); i++) { v <<= 1; } return i; #endif } /* --------------------------------------------------------------------------- */ #define biari_encode_symbol_vrdo(p_aec, symbol, p_ctx) \ p_aec->i_bits_to_follow++ #define DECLARE_CONTEXT(p_ctx) /* --------------------------------------------------------------------------- */ #define biari_encode_tu_vrdo(p_aec, num_zeros, max_len, p_ctx) \ p_aec->i_bits_to_follow += num_zeros + (max_len != num_zeros) /* --------------------------------------------------------------------------- */ #define biari_encode_symbol_eq_prob_vrdo(p_aec, symbol) p_aec->i_bits_to_follow++ /* --------------------------------------------------------------------------- */ #define biari_encode_symbols_eq_prob_vrdo(p_aec, val, len) p_aec->i_bits_to_follow += len /* --------------------------------------------------------------------------- */ static INLINE void biari_encode_symbol_final_vrdo(aec_t *p_aec, uint8_t symbol) { const uint32_t t1 = p_aec->i_t1; if (symbol) { p_aec->i_bits_to_follow += (!t1) + 8; p_aec->i_t1 = 0; } else { // MPS p_aec->i_bits_to_follow += (!t1); p_aec->i_t1 = (t1 - 1) & 0xff; } } /** * =========================================================================== * syntax coding * =========================================================================== */ /* --------------------------------------------------------------------------- * cu type for B/F/P frame */ static INLINE int aec_write_cutype_vrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_cu_cbp, int is_amp_enabled) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->cu_type_contexts); int org_bits = arienco_bits_written(p_aec); int act_sym = MAP_CU_TYPE[i_cu_type]; if (i_cu_type == PRED_SKIP && i_cu_cbp == 0) { act_sym = 0; } switch (act_sym) { case 0: // SKIP biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 0); break; case 1: // DIRECT biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); break; case 2: // 2Nx2N biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 2); break; case 3: // 2NxN, 2NxnU, 2NxnD biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 3); if (is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { DECLARE_CONTEXT(p_ctx = p_aec->p_ctx_set->shape_of_partition_index); if (i_cu_type == PRED_2NxN) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_vrdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_vrdo(p_aec, (uint8_t)(i_cu_type == PRED_2NxnU), p_ctx + 1); // AMP shape } } break; case 4: // Nx2N, nLx2N, nRx2N biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 4); if 
(is_amp_enabled && i_cu_level >= B16X16_IN_BIT) { DECLARE_CONTEXT(p_ctx = p_aec->p_ctx_set->shape_of_partition_index); if (i_cu_type == PRED_Nx2N) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); // SMP - AMP signal bit } else { biari_encode_symbol_vrdo(p_aec, 0, p_ctx); // SMP - AMP signal bit biari_encode_symbol_vrdo(p_aec, (uint8_t)(i_cu_type == PRED_nLx2N), p_ctx + 1); // AMP shape } } break; //case 5: // NxN, not enabled // biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); // biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); // biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); // biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); // biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 4); // if (i_cu_level > B8X8_IN_BIT) { // biari_encode_symbol_final_vrdo(p_aec, 1); // } // break; default: // case 6: // Intra biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 0); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 4); if (i_cu_level > B8X8_IN_BIT) { biari_encode_symbol_final_vrdo(p_aec, 0); } break; } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode a pair of intra prediction modes of a given cu */ static int aec_write_intra_pred_mode_vrdo(aec_t *p_aec, int ipmode) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->intra_luma_pred_mode); int org_bits = arienco_bits_written(p_aec); if (ipmode >= 0) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx ); biari_encode_symbol_vrdo(p_aec, (uint8_t)((ipmode & 0x10) >> 4), p_ctx + 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)((ipmode & 0x08) >> 3), p_ctx + 2); biari_encode_symbol_vrdo(p_aec, (uint8_t)((ipmode & 0x04) >> 2), p_ctx + 3); biari_encode_symbol_vrdo(p_aec, (uint8_t)((ipmode & 0x02) >> 1), p_ctx + 4); biari_encode_symbol_vrdo(p_aec, (uint8_t)((ipmode & 0x01) ), p_ctx + 5); } else { biari_encode_symbol_vrdo(p_aec, 1, p_ctx ); biari_encode_symbol_vrdo(p_aec, (uint8_t)(ipmode + 2), p_ctx + 6); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the reference parameter of a given cu */ static INLINE int aec_write_ref_vrdo(xavs2_t *h, aec_t *p_aec, int ref_idx) { context_t *p_ctx = p_aec->p_ctx_set->pu_reference_index; int org_bits = arienco_bits_written(p_aec); int act_sym = ref_idx; /* 0λ0ģ1λ1ģ2 */ if (act_sym == 0) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); } else { int act_ctx = 0; biari_encode_symbol_vrdo(p_aec, 0, p_ctx++); while (--act_sym != 0) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx); if (!act_ctx) { p_ctx++; } } if (ref_idx < h->i_ref - 1) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the motion vector data */ static INLINE int aec_write_mvd_vrdo(aec_t *p_aec, int mvd, int xy) { int org_bits = arienco_bits_written(p_aec); uint32_t act_sym = XAVS2_ABS(mvd); UNUSED_PARAMETER(xy); if (act_sym == 0) { p_aec->i_bits_to_follow += 1; } else if (act_sym < 3) { // 1, 2 p_aec->i_bits_to_follow += act_sym + 2; } else { int exp_golomb_order = 0; // odds > 3: (act_sym - 3) >> 1 // even > 3: (act_sym - 4) >> 1 act_sym = (act_sym - 3) >> 1; /* exp_golomb part */ while 
(act_sym >= (uint32_t)(1 << exp_golomb_order)) { act_sym -= (1 << exp_golomb_order); exp_golomb_order++; } // Exp-Golomb: (prefix) + 1 + (suffix) p_aec->i_bits_to_follow += 5 + (exp_golomb_order + 1 + exp_golomb_order); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dmh_mode_vrdo(aec_t *p_aec, int i_cu_level, int dmh_mode) { static const int iEncMapTab[9] = { 0, 5, 6, 1, 2, 7, 8, 3, 4 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index + 3; int org_bits = arienco_bits_written(p_aec); int symbol = dmh_mode != 0; p_ctx += (i_cu_level - MIN_CU_SIZE_IN_BIT) * 3; biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx); if (symbol) { int iMapVal = iEncMapTab[dmh_mode]; if (iMapVal < 3) { symbol = (iMapVal != 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)symbol); } else if (iMapVal < 5) { symbol = (iMapVal != 3); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)symbol); } else { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 2); symbol = (iMapVal >= 7); biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)symbol); symbol = !(iMapVal & 1); biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)symbol); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * write "transform_split_flag" and SDIP type for intra CU */ static INLINE int aec_write_intra_cutype_vrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int i_tu_split, int is_sdip_enabled) { int org_bits = arienco_bits_written(p_aec); UNUSED_PARAMETER(i_cu_type); if (i_cu_level == B8X8_IN_BIT) { p_aec->i_bits_to_follow++; } else if (is_sdip_enabled && (i_cu_level == B32X32_IN_BIT || i_cu_level == B16X16_IN_BIT)) { uint8_t transform_split_flag = i_tu_split != TU_SPLIT_NON; /* just write split or not */ p_aec->i_bits_to_follow += 1 + transform_split_flag; } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_vrdo(aec_t *p_aec, int i_cu_type, int i_cu_level, int pdir0, int pdir1) { int new_pdir[4] = { 2, 1, 3, 0 }; context_t *p_ctx = p_aec->p_ctx_set->pu_type_index; int org_bits = arienco_bits_written(p_aec); int act_ctx = 0; int act_sym; int symbol; if (i_cu_type == PRED_2Nx2N) { /* һCUֻһPUPUĸʹ3: 0, 1, 2 */ act_sym = pdir0; while (act_sym >= 1) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } } else if ((i_cu_type >= PRED_2NxN && i_cu_type <= PRED_nRx2N) && i_cu_level == B8X8_IN_BIT) { /* һCUΪPUCUСΪ8x8ʱԤΪ4x88x4ÿPUֻǵԤ⣬ * ܼ4ϣҪλBit b_pu_type_min_index ʹ */ p_ctx = p_aec->p_ctx_set->b_pu_type_min_index; pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = (pdir0 != 1); biari_encode_symbol_vrdo(p_aec, (int8_t)act_sym, p_ctx + 0); act_sym = (pdir0 == pdir1); biari_encode_symbol_vrdo(p_aec, (int8_t)act_sym, p_ctx + 1); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { //1010 /* act_ctx: 3,...,14 */ pdir0 = new_pdir[pdir0]; pdir1 = new_pdir[pdir1]; act_sym = pdir0; act_ctx = 3; /* 3,4,5 */ while (act_sym >= 1) { 
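/* truncated-unary bins for pdir0 (contexts 3..5): one '0' bin per step, closed by a '1'
 * below unless pdir0 == 3. Note the enclosing 'else if' tests ">= PRED_2NxN || <= PRED_nRx2N",
 * which matches every remaining cu type; this appears harmless because write_cu_header_vrdo
 * only calls this function for 2Nx2N..nRx2N modes. */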
biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; act_ctx++; } if (pdir0 != 3) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } symbol = (pdir0 == pdir1); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 6); /* 7,...,14 */ if (!symbol) { switch (pdir0) { case 0: symbol = (pdir1 == 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 7); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 8); } break; case 1: symbol = (pdir1 == 0); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 9); if (!symbol) { symbol = (pdir1 == 2); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 10); } break; case 2: symbol = (pdir1 == 0); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 11); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 12); } break; case 3: symbol = (pdir1 == 0); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 13); if (!symbol) { symbol = (pdir1 == 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)symbol, p_ctx + 14); } break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_pdir_dhp_vrdo(aec_t *p_aec, int i_cu_type, int pdir0, int pdir1) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->pu_type_index); int org_bits = arienco_bits_written(p_aec); pdir0 = (pdir0 != 0); pdir1 = (pdir1 != 0); if (i_cu_type == PRED_2Nx2N) { biari_encode_symbol_vrdo(p_aec, (uint8_t)pdir0, p_ctx); } else if (i_cu_type >= PRED_2NxN || i_cu_type <= PRED_nRx2N) { // 1010 biari_encode_symbol_vrdo(p_aec, (uint8_t)pdir0, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)(pdir0 == pdir1), p_ctx + 2); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_wpm_vrdo(aec_t *p_aec, int ref_idx, int num_ref) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->weighted_skip_mode); int org_bits = arienco_bits_written(p_aec); int i, idx_bin = 0; for (i = 0; i < ref_idx; i++) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + idx_bin); idx_bin = XAVS2_MIN(idx_bin + 1, 2); } if (ref_idx < num_ref - 1) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + idx_bin); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static INLINE int aec_write_spatial_skip_mode_vrdo(aec_t *p_aec, int mode) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->cu_subtype_index); int org_bits = arienco_bits_written(p_aec); int offset; for (offset = 0; offset < mode; offset++) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + offset); } if (mode < DS_MAX_NUM) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + offset); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the chroma intra prediction mode of an 8x8 block */ static INLINE int aec_write_intra_pred_cmode_vrdo(aec_t *p_aec, cu_info_t *p_cu_info, int i_left_cmode) { int i_chroma_mode = p_cu_info->i_intra_mode_c; int org_bits = arienco_bits_written(p_aec); UNUSED_PARAMETER(i_left_cmode); if (i_chroma_mode == DM_PRED_C) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } else { int lmode = 
tab_intra_mode_luma2chroma[p_cu_info->real_intra_modes[0]]; int is_redundant = lmode >= 0; biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); i_chroma_mode -= (1 + (is_redundant && i_chroma_mode > lmode)); p_aec->i_bits_to_follow += i_chroma_mode; if (i_chroma_mode < 3) { p_aec->i_bits_to_follow++; } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of an luma CB */ #define write_cbp_bit_vrdo(h, p_aec, p_cu_info, b8, bit) p_aec->i_bits_to_follow++ /* --------------------------------------------------------------------------- * arithmetically encode the coded block pattern of a cu */ static INLINE int aec_write_cu_cbp_vrdo(aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, xavs2_t *h) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->cbp_contexts + 4); int org_bits = arienco_bits_written(p_aec); int i_cu_cbp = p_cu_info->i_cbp; int i_cu_type = p_cu_info->i_mode; int transform_split_flag = p_cu_info->i_tu_split != TU_SPLIT_NON; UNUSED_PARAMETER(slice_index_cur_cu); if (IS_INTER_MODE(i_cu_type)) { /* write cbp for inter pred mode --------------------------- */ if (!IS_SKIP_MODE(i_cu_type)) { write_cbp_bit_vrdo(h, p_aec, p_cu_info, 4, i_cu_cbp == 0); } if (i_cu_cbp) { // write tr_size biari_encode_symbol_vrdo(p_aec, (uint8_t)transform_split_flag, p_aec->p_ctx_set->transform_split_flag); // write cbp for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_vrdo(p_aec, 0, p_ctx); break; case 1: biari_encode_symbol_vrdo(p_aec, 1, p_ctx); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); break; case 2: biari_encode_symbol_vrdo(p_aec, 1, p_ctx); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 2); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 2); break; case 3: biari_encode_symbol_vrdo(p_aec, 1, p_ctx); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 2); break; } } // write cbp for luma if (transform_split_flag == 0) { if (i_cu_cbp > 15) { write_cbp_bit_vrdo(h, p_aec, p_cu_info, 0, (i_cu_cbp & 1) != 0); } } else { write_cbp_bit_vrdo(h, p_aec, p_cu_info, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 3, (i_cu_cbp & 8) != 0); } } } else { /* write cbp for intra pred mode --------------------------- */ // write bits for luma if (transform_split_flag == 0 || i_cu_type == PRED_I_2Nx2N) { write_cbp_bit_vrdo(h, p_aec, p_cu_info, 0, (i_cu_cbp & 0x0F) != 0); } else { write_cbp_bit_vrdo(h, p_aec, p_cu_info, 0, (i_cu_cbp & 1) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 1, (i_cu_cbp & 2) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 2, (i_cu_cbp & 4) != 0); write_cbp_bit_vrdo(h, p_aec, p_cu_info, 3, (i_cu_cbp & 8) != 0); } // write bits for chroma if (h->param->chroma_format != CHROMA_400) { switch ((i_cu_cbp >> 4) & 0x03) { case 0: biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 1); break; case 1: biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); break; case 2: biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 0, p_ctx + 3); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 3); break; case 3: biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 1); biari_encode_symbol_vrdo(p_aec, 1, p_ctx + 3); 
break; } } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #if ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ static INLINE int aec_write_dqp_vrdo(aec_t *p_aec, int delta_qp, int last_dqp) { context_t *p_ctx = p_aec->p_ctx_set->delta_qp_contexts; int org_bits = arienco_bits_written(p_aec); int act_ctx = (last_dqp) ? 1 : 0; int act_sym = (delta_qp > 0) ? (2 * delta_qp - 1) : (-2 * delta_qp); if (act_sym == 0) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); act_ctx = 2; if (act_sym == 1) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } else { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); act_ctx++; while (act_sym > 2) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx + act_ctx); act_sym--; } biari_encode_symbol_vrdo(p_aec, 1, p_ctx + act_ctx); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } #endif /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_cg_pos(aec_t *p_aec, int b_luma, int b_dc_diag, int i_cg, int cg_last_x, int cg_last_y, int num_cg, int num_cg_x_minus1, int num_cg_y_minus1) { int count; if (num_cg == 4) { // 8x8 p_aec->i_bits_to_follow += XAVS2_MIN(3, i_cg + 1); } else { if (b_luma && b_dc_diag) { XAVS2_SWAP(cg_last_x, cg_last_y); XAVS2_SWAP(num_cg_x_minus1, num_cg_y_minus1); } p_aec->i_bits_to_follow++; /* last_cg0_flag */ if (cg_last_x || cg_last_y) { /* last_cg_x */ biari_encode_tu_vrdo(p_aec, cg_last_x, num_cg_x_minus1, NULL); /* last_cg_y or last_cg_y_minus1 */ count = (cg_last_x == 0); // cg_last_xΪ㣬cg_last_yдһ㣨һ㣩 biari_encode_tu_vrdo(p_aec, cg_last_y - count, num_cg_y_minus1 - count, NULL); } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void aec_write_last_coeff_pos(aec_t *p_aec, int isLastCG, int pos, int b_dc_diag) { static const ALIGN32(int8_t tab_bits_pos[4][16]) = { { 4, 5, 4, 3, 5, 6, 6, 6, 4, 2, 3, 5, 6, 5, 4, 4 }, { 2, 3, 3, 4, 4, 4, 4, 5, 5, 4, 5, 6, 5, 6, 6, 6 }, { 6, 6, 6, 5, 6, 5, 4, 5, 5, 4, 4, 4, 4, 3, 3, 2 }, { 6, 6, 6, 5, 6, 5, 4, 5, 5, 4, 4, 4, 4, 3, 3, 2 }, }; // int scan_pos = tab_1d_scan_4x4[15 - pos]; p_aec->i_bits_to_follow += tab_bits_pos[(isLastCG << 1) + b_dc_diag][pos]; } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_luma_vrdo(aec_t *p_aec, int b_dc_diag, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { DECLARE_CONTEXT(context_t *p_ctx = NULL); int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 
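check whether the current CG (4x4 coefficient group) contains any non-zero coefficients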
鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, runlevel->b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { biari_encode_symbol_vrdo(p_aec, !!i, p_ctx); if (!i) { continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { // for TB > 4x4, need to write int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 1, b_dc_diag, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; } /* early terminate? */ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { aec_write_last_coeff_pos(p_aec, rank == 0, pos, b_dc_diag); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { DECLARE_CONTEXT(int offset = 0); int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_vrdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_vrdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_vrdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { biari_encode_symbol_final_vrdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ biari_encode_tu_vrdo(p_aec, symbol, 31, p_ctx); } rank = 1; /* 3.3, run, "coeff_run[i]" */ p_aec->i_bits_to_follow += Run; pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_vrdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level luma, POC[%d]: p_cu: (%d, %d), level %d, cu_type %d\n", h->fdec->i_poc, runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_run_level_chroma_vrdo(aec_t *p_aec, runlevel_t *runlevel, xavs2_t *h, int maxvalue) { const int16_t(*p_tab_cg_scan)[2] = runlevel->tab_cg_scan; runlevel_pair_t *p_runlevel = runlevel->runlevels_cg; int rank = 0; int num_cg = runlevel->num_cg; int org_bits = arienco_bits_written(p_aec); int i_cg; int cur_bits; UNUSED_PARAMETER(h); /* write coefficients in CG */ for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { DECLARE_CONTEXT(context_t *p_ctx); int CGx = 0; int CGy = 0; uint32_t Level_sign = 0; int pos; int num_pairs; int pairs; int pairsInCG; int i; /* 1. 鵱ǰCGǷзϵ */ coeff_t *quant_coeff = runlevel->quant_coeff; const int b_hor = 0; // runlevel->b_hor; quant_coeff += ((p_tab_cg_scan[i_cg][!b_hor] << runlevel->i_stride_shift) + p_tab_cg_scan[i_cg][b_hor]) << 2; num_pairs = tu_get_cg_run_level_info(runlevel, quant_coeff, runlevel->i_stride_shift, b_hor); i = num_pairs; // number of pairs in CG /* 2, Sig CG Flag, "nonzero_cg_flag" */ if (rank > 0) { biari_encode_symbol_vrdo(p_aec, !!i, p_ctx); if (!i) { continue; // ޷ϵǰCG } CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; } else if (i > 0) { if (num_cg > 1) { int num_cg_x = p_tab_cg_scan[num_cg - 1][0]; int num_cg_y = p_tab_cg_scan[num_cg - 1][1]; CGx = p_tab_cg_scan[i_cg][0]; CGy = p_tab_cg_scan[i_cg][1]; aec_write_last_cg_pos(p_aec, 0, 1, i_cg, CGx, CGy, num_cg, num_cg_x, num_cg_y); } } else { continue; // δҵһϵCG } /* early terminate? 
*/ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); /* 3, (Run, Level) */ /* 3.1, LAST IN CG, "last_coeff_pos_x", "last_coeff_pos_y" */ pos = runlevel->last_pos_cg; pairs = num_pairs - 1; { aec_write_last_coeff_pos(p_aec, rank == 0, pos, 1); } for (pairsInCG = 0; i > 0 && pos < NUM_OF_COEFFS_IN_CG; i--, pairs--, pairsInCG++) { DECLARE_CONTEXT(int offset = 0); int Level = p_runlevel[pairs].level; int Run = p_runlevel[pairs].run; int absLevel = XAVS2_ABS(Level); int symbol = absLevel - 1; Level_sign |= (Level < 0) << i; // record Sign /* 3.2, level, "coeff_level_minus1_band[i]", "coeff_level_minus1_pos_in_band[i]" */ if (symbol > 31) { int exp_golomb_order = 0; biari_encode_symbol_final_vrdo(p_aec, 1); // "coeff_level_minus1_band[i]", > 32 /* coeff_level_minus1_pos_in_band[i] */ symbol -= 32; while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } biari_encode_symbols_eq_prob_vrdo(p_aec, 1, exp_golomb_order + 1); // Exp-Golomb: prefix and 1 biari_encode_symbols_eq_prob_vrdo(p_aec, symbol, exp_golomb_order); // Exp-Golomb: suffix } else { biari_encode_symbol_final_vrdo(p_aec, 0); // "coeff_level_minus1_band[i]", <= 32 /* coeff_level_minus1_pos_in_band[i] */ biari_encode_tu_vrdo(p_aec, symbol, 31, p_ctx); } rank = 1; // update rank /* 3.3, run, "coeff_run[i]" */ p_aec->i_bits_to_follow += Run; pos += (Run + 1); // update position if (pos < NUM_OF_COEFFS_IN_CG) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx + offset); } else { pairs--; pairsInCG++; break; } } // run-level loop /* 4, sign of coefficient */ biari_encode_symbols_eq_prob_vrdo(p_aec, Level_sign >> 1, num_pairs); /* early terminate? */ CHECK_EARLY_RETURN_RUNLEVEL(p_aec); } // for (; i_cg >= 0; i_cg--) /* get the number of written bits */ org_bits = arienco_bits_written(p_aec) - org_bits; #ifdef DEBUG if (rank == 0) { xavs2_log(h, XAVS2_LOG_ERROR, "no non-zero run-level chroma, p_cu: (%d, %d), level %d, cu_type %d\n", runlevel->p_cu_info->i_scu_x, runlevel->p_cu_info->i_scu_y, runlevel->p_cu_info->i_level, runlevel->p_cu_info->i_mode); } #endif assert(rank > 0); // зϵʱrankֵӦ /* return the number of written bits */ return org_bits; } /* --------------------------------------------------------------------------- */ int aec_write_split_flag_vrdo(aec_t *p_aec, int i_cu_split, int i_cu_level) { int org_bits = arienco_bits_written(p_aec); UNUSED_PARAMETER(i_cu_level); UNUSED_PARAMETER(i_cu_split); p_aec->i_bits_to_follow++; /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mergeflag_vrdo(aec_t *p_aec, int avail_left, int avail_up, SAOBlkParam *p_sao_param) { int b_merge_left = 0; int b_merge_up; int val = 0; DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->sao_merge_type_index); int org_bits = arienco_bits_written(p_aec); int ctx_offset = avail_left + avail_up; if (avail_left) { b_merge_left = (p_sao_param->mergeIdx == SAO_MERGE_LEFT); val = b_merge_left ? 1 : 0; } if (avail_up && !b_merge_left) { b_merge_up = (p_sao_param->mergeIdx == SAO_MERGE_ABOVE); val = b_merge_up ? 
(1 + avail_left) : 0; } if (ctx_offset == 1) { assert(val <= 1); biari_encode_symbol_vrdo(p_aec, (uint8_t)val, p_ctx + 0); } else if (ctx_offset == 2) { assert(val <= 2); biari_encode_symbol_vrdo(p_aec, val & 0x01, p_ctx + 1); if (val != 1) { biari_encode_symbol_vrdo(p_aec, (val >> 1) & 0x01, p_ctx + 2); } } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_mode_vrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->sao_mode); int org_bits = arienco_bits_written(p_aec); int sao_type = saoBlkParam->typeIdc; if (sao_type == SAO_TYPE_OFF) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); } else if (sao_type == SAO_TYPE_BO) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_vrdo(p_aec, 1); } else { // SAO_TYPE_EO (0~3) biari_encode_symbol_vrdo(p_aec, 0, p_ctx); biari_encode_symbol_eq_prob_vrdo(p_aec, 0); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ static int aec_write_sao_offset_vrdo(aec_t *p_aec, int val, int offset_type) { /* --------------------------------------------------------------------------- */ static const int EO_OFFSET_MAP[8] = { 3, 1, 0, 2, 4, 5, 6, 7 }; DECLARE_CONTEXT(context_t *p_ctx = p_aec->p_ctx_set->sao_interval_offset_abs); int org_bits = arienco_bits_written(p_aec); int act_sym; assert(offset_type != SAO_CLASS_EO_PLAIN); if (offset_type == SAO_CLASS_EO_FULL_VALLEY) { act_sym = EO_OFFSET_MAP[val + 1]; } else if (offset_type == SAO_CLASS_EO_FULL_PEAK) { act_sym = EO_OFFSET_MAP[-val + 1]; } else { act_sym = XAVS2_ABS(val); } if (act_sym == 0) { if (offset_type == SAO_CLASS_BO) { biari_encode_symbol_vrdo(p_aec, 1, p_ctx); } else { biari_encode_symbol_eq_prob_vrdo(p_aec, 1); } } else { int maxvalue = tab_saoclip[offset_type][2]; int temp = act_sym; while (temp != 0) { if (offset_type == SAO_CLASS_BO && temp == act_sym) { biari_encode_symbol_vrdo(p_aec, 0, p_ctx); } else { biari_encode_symbol_eq_prob_vrdo(p_aec, 0); } temp--; } if (act_sym < maxvalue) { biari_encode_symbol_eq_prob_vrdo(p_aec, 1); } } if (offset_type == SAO_CLASS_BO && act_sym) { // write sign symbol biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)(val >= 0 ? 
0 : 1)); } /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- */ int write_sao_offset_vrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int bandIdxBO[4]; bandIdxBO[0] = saoBlkParam->startBand; bandIdxBO[1] = bandIdxBO[0] + 1; bandIdxBO[2] = (saoBlkParam->startBand + saoBlkParam->deltaBand) & 31; bandIdxBO[3] = bandIdxBO[2] + 1; rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[bandIdxBO[0]], SAO_CLASS_BO); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[bandIdxBO[1]], SAO_CLASS_BO); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[bandIdxBO[2]], SAO_CLASS_BO); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[bandIdxBO[3]], SAO_CLASS_BO); } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_VALLEY], SAO_CLASS_EO_FULL_VALLEY); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_VALLEY], SAO_CLASS_EO_HALF_VALLEY); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_HALF_PEAK], SAO_CLASS_EO_HALF_PEAK); rate += aec_write_sao_offset_vrdo(p_aec, saoBlkParam->offset[SAO_CLASS_EO_FULL_PEAK], SAO_CLASS_EO_FULL_PEAK); } return rate; } /* --------------------------------------------------------------------------- */ int write_sao_type_vrdo(aec_t *p_aec, SAOBlkParam *saoBlkParam) { int rate = 0; int val; assert(saoBlkParam->typeIdc != SAO_TYPE_OFF); if (saoBlkParam->typeIdc == SAO_TYPE_BO) { int exp_golomb_order = 1; /* start band */ val = saoBlkParam->startBand; biari_encode_symbol_eq_prob_vrdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_vrdo(p_aec, (val >> 1) & 0x01); biari_encode_symbol_eq_prob_vrdo(p_aec, (val >> 2) & 0x01); biari_encode_symbol_eq_prob_vrdo(p_aec, (val >> 3) & 0x01); biari_encode_symbol_eq_prob_vrdo(p_aec, (val >> 4) & 0x01); /* delta band */ assert(saoBlkParam->deltaBand >= 2); val = saoBlkParam->deltaBand - 2; while (val >= (1 << exp_golomb_order)) { biari_encode_symbol_eq_prob_vrdo(p_aec, 0); val -= (1 << exp_golomb_order); exp_golomb_order++; } if (exp_golomb_order == 4) { exp_golomb_order = 0; } else { biari_encode_symbol_eq_prob_vrdo(p_aec, 1); } while (exp_golomb_order--) { // next binary part biari_encode_symbol_eq_prob_vrdo(p_aec, (uint8_t)((val >> exp_golomb_order) & 1)); } } else { assert(saoBlkParam->typeIdc >= SAO_TYPE_EO_0 && saoBlkParam->typeIdc <= SAO_TYPE_EO_45); val = saoBlkParam->typeIdc; biari_encode_symbol_eq_prob_vrdo(p_aec, val & 0x01); biari_encode_symbol_eq_prob_vrdo(p_aec, (val >> 1) & 0x01); } return rate; } /* --------------------------------------------------------------------------- */ int aec_write_alf_lcu_ctrl_vrdo(aec_t *p_aec, uint8_t iflag) { int org_bits = arienco_bits_written(p_aec); DECLARE_CONTEXT(context_t *p_ctx = &(p_aec->p_ctx_set->alf_cu_enable_scmodel[0][0])); UNUSED_PARAMETER(iflag); biari_encode_symbol_vrdo(p_aec, iflag, p_ctx); /* return the number of written bits */ return arienco_bits_written(p_aec) - org_bits; } /* --------------------------------------------------------------------------- * codes cu header */ static int write_cu_header_vrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int rate = 0; int level = p_cu->cu_info.i_level; int mode = p_cu->cu_info.i_mode; int i; // write bits for inter cu 
type if (h->i_type != SLICE_TYPE_I) { rate += aec_write_cutype_vrdo(p_aec, mode, level, p_cu->cu_info.i_cbp, h->param->enable_amp); if (h->i_type == SLICE_TYPE_B && (mode >= PRED_2Nx2N && mode <= PRED_nRx2N)) { rate += aec_write_pdir_vrdo(p_aec, mode, level, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } else if (h->i_type == SLICE_TYPE_F && h->param->enable_dhp && (h->i_ref > 1) && ((mode >= PRED_2Nx2N && mode <= PRED_nRx2N && level > B8X8_IN_BIT) || (mode == PRED_2Nx2N && level == B8X8_IN_BIT))) { rate += aec_write_pdir_dhp_vrdo(p_aec, mode, p_cu->cu_info.b8pdir[0], p_cu->cu_info.b8pdir[1]); } /* write bits for F slice skip/direct mode */ if (IS_SKIP_MODE(mode)) { int b_write_spatial_skip = 0; if (h->i_type == SLICE_TYPE_F) { int weighted_skip_mode = p_cu->cu_info.directskip_wsm_idx; /* write weighted skip mode */ if (h->param->enable_wsm && h->i_ref > 1) { rate += aec_write_wpm_vrdo(p_aec, weighted_skip_mode, h->i_ref); } /* write bits for F-spatial-skip mode */ b_write_spatial_skip = (h->param->enable_mhp_skip && (weighted_skip_mode == 0)); } b_write_spatial_skip = b_write_spatial_skip || (SLICE_TYPE_B == h->i_type); /* write bits for b-direct-skip mode */ if (b_write_spatial_skip) { rate += aec_write_spatial_skip_mode_vrdo(p_aec, p_cu->cu_info.directskip_mhp_idx + 1); } } } // write bits for intra modes if (IS_INTRA_MODE(mode)) { int num_of_intra_block = mode != PRED_I_2Nx2N ? 4 : 1; /* write "transform_split_flag" and cu_type for SDIP */ rate += aec_write_intra_cutype_vrdo(p_aec, mode, level, p_cu->cu_info.i_tu_split, h->param->enable_sdip); /* write intra pred mode */ for (i = 0; i < num_of_intra_block; i++) { rate += aec_write_intra_pred_mode_vrdo(p_aec, p_cu->cu_info.pred_intra_modes[i]); } if (h->param->chroma_format != CHROMA_400) { int i_left_cmode = DM_PRED_C; /* check left */ if (p_cu->p_left_cu != NULL) { i_left_cmode = p_cu->p_left_cu->i_intra_mode_c; } rate += aec_write_intra_pred_cmode_vrdo(p_aec, &p_cu->cu_info, i_left_cmode); } } return rate; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int write_mvd_vrdo(aec_t *p_aec, cu_t *p_cu, int k, int bwd_flag) { int curr_mvd_x = p_cu->cu_info.mvd[bwd_flag][k].x; int curr_mvd_y = p_cu->cu_info.mvd[bwd_flag][k].y; int rate; rate = aec_write_mvd_vrdo(p_aec, curr_mvd_x, 0); rate += aec_write_mvd_vrdo(p_aec, curr_mvd_y, 1); return rate; } /* --------------------------------------------------------------------------- */ static int write_cu_refs_mvds_vrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu) { int mode = p_cu->cu_info.i_mode; int rate = 0; int k, refframe; int pdir; int dmh_mode; /* When CU is intra or skip mode, no need to code ref_idx and mvd */ if (IS_INTRA_MODE(mode) || IS_SKIP_MODE(mode)) { return 0; } /* only one frame on each direction, no need to code ref_idx */ // forward reference if (h->i_type != SLICE_TYPE_B && h->i_ref > 1) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { if (p_cu->cu_info.b8pdir[k] == PDIR_FWD || p_cu->cu_info.b8pdir[k] == PDIR_DUAL) { refframe = p_cu->cu_info.ref_idx_1st[k]; rate += aec_write_ref_vrdo(h, p_aec, refframe); } } } /* write backward reference indexes of this CU, no need for current AVS2 */ /* write DMH mode, "dir_multi_hypothesis_mode" */ if (h->i_type == SLICE_TYPE_F /*&& h->param->enable_dmh*/ && p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD && p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) { if (!(p_cu->cu_info.i_level == B8X8_IN_BIT && p_cu->cu_info.i_mode >= 
PRED_2NxN && p_cu->cu_info.i_mode <= PRED_nRx2N)) { dmh_mode = p_cu->cu_info.dmh_mode; rate += aec_write_dmh_mode_vrdo(p_aec, p_cu->cu_info.i_level, dmh_mode); } } /* write forward MVD */ for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir != PDIR_BWD) { rate += write_mvd_vrdo(p_aec, p_cu, k, 0); } } /* write backward MVD */ if (h->i_type == SLICE_TYPE_B) { for (k = 0; k < p_cu->cu_info.num_pu; k++) { pdir = p_cu->cu_info.b8pdir[k]; if (pdir == PDIR_BWD || pdir == PDIR_BID) { //has backward vector rate += write_mvd_vrdo(p_aec, p_cu, k, 1); } } } return rate; } #if ENABLE_RATE_CONTROL_CU /* --------------------------------------------------------------------------- */ int write_cu_cbp_dqp_vrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_cu_info, int slice_index_cur_cu, int *last_dqp) { int rate = aec_write_cu_cbp_vrdo(p_aec, p_cu_info, slice_index_cur_cu, h); if (!p_cu_info->i_cbp) { *last_dqp = 0; } if (p_cu_info->i_cbp != 0 && h->param->i_rc_method == XAVS2_RC_CBR_SCU) { rate += aec_write_dqp_vrdo(p_aec, cu_get_qp(h, p_cu_info), *last_dqp); #if ENABLE_RATE_CONTROL_CU *last_dqp = p_cu_info->i_delta_qp; #else *last_dqp = 0; #endif } return rate; } #endif /* --------------------------------------------------------------------------- */ static int write_luma_block_coeff_vrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int i_stride_shift, int is_intra, int intra_mode, int max_bits) { const int16_t(*cg_scan)[2] = NULL; int b_ver = p_cu->cu_info.i_tu_split == TU_SPLIT_VER; int b_hor = p_cu->cu_info.i_tu_split == TU_SPLIT_HOR; int intra_pred_class = INTRA_PRED_DC_DIAG; int num_cg; if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (b_hor) { cg_scan = tab_cg_scan_list_hor[i_level - 2]; } else if (b_ver) { cg_scan = tab_cg_scan_list_ver[i_level - 2]; } else { cg_scan = tab_cg_scan_list_nxn[i_level - 2]; } // reset b_hor and b_ver b_hor = (is_intra && tab_intra_mode_scan_type[intra_mode] == INTRA_PRED_HOR && p_cu->cu_info.i_mode != PRED_I_2Nxn && p_cu->cu_info.i_mode != PRED_I_nx2N); b_ver = !b_hor; num_cg = 1 << (i_level + i_level - 4); // number of CGs if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = cg_scan; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_stride_shift; runlevel->b_hor = b_hor; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; /* return bit rate */ if (IS_INTRA_MODE(p_cu->cu_info.i_mode)) { assert(intra_mode < NUM_INTRA_MODE); intra_pred_class = tab_intra_mode_scan_type[intra_mode]; } return aec_write_run_level_luma_vrdo(p_aec, intra_pred_class == INTRA_PRED_DC_DIAG, runlevel, h, max_bits); } /* --------------------------------------------------------------------------- */ static int write_chroma_block_coeff_vrdo(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *quant_coeff, runlevel_t *runlevel, int i_level, int max_bits) { int num_cg = 1 << (i_level + i_level - 4); if (max_bits < 1) { return 1; ///< run_levelҪ1أsignΪbypassģʽ } if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && num_cg == 64 && !h->lcu.b_2nd_rdcost_pass) { // 32x32 TB num_cg = 25; } /* ʼRunLevelṹ */ runlevel->tab_cg_scan = tab_cg_scan_list_nxn[i_level - 2]; runlevel->num_cg = num_cg; runlevel->i_stride_shift = i_level; runlevel->b_hor = 0; runlevel->quant_coeff = quant_coeff; runlevel->p_cu_info = &p_cu->cu_info; return aec_write_run_level_chroma_vrdo(p_aec, runlevel, h, max_bits); } /** * 
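gf_aec_vrdo below is the "virtual" RDO bit estimator: every biari_encode_*_vrdo call in this file is a macro that only increments p_aec->i_bits_to_follow (roughly one bit per bin, plus run and Exp-Golomb lengths), so no context model is read or updated and no arithmetic-coder state changes apart from the small i_t1 counter used by biari_encode_symbol_final_vrdo. The result is a faster but coarser rate estimate than the context-based gf_aec_rdo path.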
=========================================================================== * function handler * =========================================================================== */ binary_t gf_aec_vrdo = { /* syntax elements */ .write_intra_pred_mode = aec_write_intra_pred_mode_vrdo, .write_ctu_split_flag = aec_write_split_flag_vrdo, .est_cu_header = write_cu_header_vrdo, .est_cu_refs_mvds = write_cu_refs_mvds_vrdo, .est_luma_block_coeff = write_luma_block_coeff_vrdo, .est_chroma_block_coeff = write_chroma_block_coeff_vrdo, #if ENABLE_RATE_CONTROL_CU .write_cu_cbp_dqp = write_cu_cbp_dqp_vrdo, #else .write_cu_cbp = aec_write_cu_cbp_vrdo, #endif .write_sao_mergeflag = write_sao_mergeflag_vrdo, .write_sao_mode = write_sao_mode_vrdo, .write_sao_offset = write_sao_offset_vrdo, .write_sao_type = write_sao_type_vrdo, .write_alf_lcu_ctrl = aec_write_alf_lcu_ctrl_vrdo, }; xavs2-1.3/source/encoder/alf.c000066400000000000000000002251551340660520300162560ustar00rootroot00000000000000/* * alf.c * * Description of this file: * ALF functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "aec.h" #include "primitives.h" #include "alf.h" #include "header.h" #include "cpu.h" #include "cudata.h" #define ROUND(a) (((a) < 0)? (int)((a) - 0.5) : (int)((a) + 0.5)) #define REG 0.0001 #define REG_SQR 0.0000001 #define Clip_post(high,val) ((val > high)? 
high: val) /** * =========================================================================== * global/local variables * =========================================================================== */ static const int tab_weightsShape1Sym[ALF_MAX_NUM_COEF + 1] = { 2, 2, 2, 2, 2, 2, 2, 2, 1, 1 }; static const int svlc_bitrate_estimate[128] = { 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 5, 5, 3, 1, 3, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 }; static const int uvlc_bitrate_estimate[128] = { 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 15 }; typedef struct dh_nc { double dh; int nc; } DhNc; typedef struct { int64_t m_autoCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; // auto-correlation matrix double m_crossCorr[NO_VAR_BINS][ALF_MAX_NUM_COEF]; // cross-correlation double pixAcc[NO_VAR_BINS]; } AlfCorrData; typedef struct { double m_cross_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int64_t m_auto_merged[NO_VAR_BINS][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; double m_cross_temp[ALF_MAX_NUM_COEF]; double m_pixAcc_merged[NO_VAR_BINS]; int64_t m_auto_temp[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; int m_coeffNoFilter[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int m_filterCoeffSym[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int m_varIndTab[NO_VAR_BINS]; AlfCorrData m_pic_corr[IMG_CMPNTS]; AlfCorrData m_alfCorrMerged[IMG_CMPNTS]; AlfCorrData *m_alfCorr[IMG_CMPNTS]; AlfCorrData *m_alfNonSkippedCorr[IMG_CMPNTS]; AlfCorrData *m_alfPrevCorr; int m_alfReDesignIteration; uint32_t m_uiBitIncrement; ALFParam m_alfPictureParam[32][IMG_CMPNTS]; int *m_numSlicesDataInOneLCU; int8_t *tab_lcu_region; } alf_ctx_t; /* ------------------------------------------------------------- */ static ALWAYS_INLINE void init_alf_frame_param(ALFParam *p_alf) { p_alf->alf_flag = 0; p_alf->num_coeff = ALF_MAX_NUM_COEF; p_alf->filters_per_group = 1; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void reconstructCoefficients(ALFParam *alfParam, int filterCoeff[][ALF_MAX_NUM_COEF]) { int g, sum, i, coeffPred; for (g = 0; g < alfParam->filters_per_group; g++) { for (i = 0, sum = 0; i < alfParam->num_coeff - 1; i++) { sum += (2 * alfParam->coeffmulti[g][i]); filterCoeff[g][i] = alfParam->coeffmulti[g][i]; } coeffPred = (1 << ALF_NUM_BIT_SHIFT) - sum; filterCoeff[g][alfParam->num_coeff - 1] = coeffPred + alfParam->coeffmulti[g][alfParam->num_coeff - 1]; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void reconstructCoefInfo(int compIdx, ALFParam *alfParam, int filterCoeff[][ALF_MAX_NUM_COEF], int *varIndTab) { int i; if (compIdx == IMG_Y) { memset(varIndTab, 0, NO_VAR_BINS * sizeof(int)); if (alfParam->filters_per_group > 1) { for (i = 1; i < 
NO_VAR_BINS; ++i) { varIndTab[i] = varIndTab[i - 1]; if (alfParam->filterPattern[i]) { varIndTab[i] ++; } } } } reconstructCoefficients(alfParam, filterCoeff); } /* --------------------------------------------------------------------------- */ static INLINE void checkFilterCoeffValue(int *filter, int filterLength) { int maxValueNonCenter = 1 * (1 << ALF_NUM_BIT_SHIFT) - 1; int minValueNonCenter = 0 - 1 * (1 << ALF_NUM_BIT_SHIFT); int maxValueCenter = 2 * (1 << ALF_NUM_BIT_SHIFT) - 1; int minValueCenter = 0; int i; for (i = 0; i < filterLength - 1; i++) { filter[i] = XAVS2_CLIP3(minValueNonCenter, maxValueNonCenter, filter[i]); } filter[filterLength - 1] = XAVS2_CLIP3(minValueCenter, maxValueCenter, filter[filterLength - 1]); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void copyALFparam(ALFParam *dst, ALFParam *src, int componentID) { int j; dst->alf_flag = src->alf_flag; dst->filters_per_group = src->filters_per_group; dst->num_coeff = src->num_coeff; switch (componentID) { case IMG_Y: for (j = 0; j < NO_VAR_BINS; j++) { memcpy(dst->coeffmulti[j], src->coeffmulti[j], ALF_MAX_NUM_COEF * sizeof(int)); } memcpy(dst->filterPattern, src->filterPattern, NO_VAR_BINS * sizeof(int)); break; case IMG_U: case IMG_V: memcpy(dst->coeffmulti[0], src->coeffmulti[0], ALF_MAX_NUM_COEF * sizeof(int)); break; default: printf("Not a legal component ID\n"); assert(0); exit(-1); } } /* --------------------------------------------------------------------------- * calculate the correlation matrix for Luma */ static void calcCorrOneCompRegionLuma(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *org, int i_org, pel_t *rec, int i_rec, int yPos, int xPos, int height, int width, int64_t m_autoCorr[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double m_crossCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAcc, int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) { int xPosEnd = xPos + width; int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0]; int startPosLuma = isAboveAvail ? (yPos - 4) : yPos; int endPosLuma = isBelowAvail ? (yPos + height - 4) : (yPos + height); int xOffSetLeft = isLeftAvail ? -3 : 0; int xOffSetRight = isRightAvail ? 
3 : 0; pel_t *imgPad = rec; pel_t *imgOrg = org; int yUp, yBottom; int xLeft, xRight; int ELocal[ALF_MAX_NUM_COEF]; pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; int i, j, k, l, yLocal, varInd; int64_t(*E)[9]; double *yy; imgPad += startPosLuma * i_rec; imgOrg += startPosLuma * i_org; varInd = Enc_ALF->tab_lcu_region[(yPos >> h->i_lcu_level) * h->i_width_in_lcu + (xPos >> h->i_lcu_level)]; int step = 1; if (IS_ALG_ENABLE(OPT_FAST_ALF)) { step = 2; } for (i = startPosLuma; i < endPosLuma; i += step) { yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 1); yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 1); imgPad1 = imgPad + (yBottom - i) * i_rec; imgPad2 = imgPad + (yUp - i) * i_rec; yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 2); yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 2); imgPad3 = imgPad + (yBottom - i) * i_rec; imgPad4 = imgPad + (yUp - i) * i_rec; yUp = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i - 3); yBottom = XAVS2_CLIP3(startPosLuma, endPosLuma - 1, i + 3); imgPad5 = imgPad + (yBottom - i) * i_rec; imgPad6 = imgPad + (yUp - i) * i_rec; for (j = xPos; j < xPosEnd; j += step) { memset(ELocal, 0, N * sizeof(int)); ELocal[0] = (imgPad5[j] + imgPad6[j]); ELocal[1] = (imgPad3[j] + imgPad4[j]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1); ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]); ELocal[3] = (imgPad1[j ] + imgPad2[j ]); ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]); ELocal[7] = (imgPad[xRight] + imgPad[xLeft]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2); ELocal[6] = (imgPad[xRight] + imgPad[xLeft]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3); ELocal[5] = (imgPad[xRight] + imgPad[xLeft]); ELocal[8] = (imgPad[j ]); yLocal = imgOrg[j]; pixAcc[varInd] += (yLocal * yLocal); E = m_autoCorr[varInd]; yy = m_crossCorr[varInd]; for (k = 0; k < N; k++) { for (l = k; l < N; l++) { E[k][l] += (ELocal[k] * ELocal[l]); } yy[k] += (double)(ELocal[k] * yLocal); } } imgPad += i_rec; imgOrg += i_org; } for (varInd = 0; varInd < NO_VAR_BINS; varInd++) { E = m_autoCorr[varInd]; for (k = 1; k < N; k++) { for (l = 0; l < k; l++) { E[k][l] = E[l][k]; } } } } /* --------------------------------------------------------------------------- * calculate the correlation matrix for Chroma */ static void calcCorrOneCompRegionChma(xavs2_t *h, pel_t *org, int i_org, pel_t *rec, int i_rec, int yPos, int xPos, int height, int width, int64_t m_autoCorr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *m_crossCorr, int isLeftAvail, int isRightAvail, int isAboveAvail, int isBelowAvail) { int xPosEnd = xPos + width; const int N = ALF_MAX_NUM_COEF; //m_sqrFiltLengthTab[0]; int startPosChroma = isAboveAvail ? (yPos - 4) : yPos; int endPosChroma = isBelowAvail ? (yPos + height - 4) : (yPos + height); int xOffSetLeft = isLeftAvail ? -3 : 0; int xOffSetRight = isRightAvail ? 
3 : 0; pel_t *imgPad = rec; pel_t *imgOrg = org; int yUp, yBottom; int xLeft, xRight; int ELocal[ALF_MAX_NUM_COEF]; pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; int i, j, k, l, yLocal; imgPad += startPosChroma * i_rec; imgOrg += startPosChroma * i_org; int step = 1; if (IS_ALG_ENABLE(OPT_FAST_ALF)) { step = 2; } for (i = startPosChroma; i < endPosChroma; i += step) { yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 1); yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 1); imgPad1 = imgPad + (yBottom - i) * i_rec; imgPad2 = imgPad + (yUp - i) * i_rec; yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 2); yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 2); imgPad3 = imgPad + (yBottom - i) * i_rec; imgPad4 = imgPad + (yUp - i) * i_rec; yUp = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i - 3); yBottom = XAVS2_CLIP3(startPosChroma, endPosChroma - 1, i + 3); imgPad5 = imgPad + (yBottom - i) * i_rec; imgPad6 = imgPad + (yUp - i) * i_rec; for (j = xPos; j < xPosEnd; j += step) { memset(ELocal, 0, N * sizeof(int)); ELocal[0] = (imgPad5[j] + imgPad6[j]); ELocal[1] = (imgPad3[j] + imgPad4[j]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 1); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 1); ELocal[2] = (imgPad1[xRight] + imgPad2[xLeft]); ELocal[3] = (imgPad1[j ] + imgPad2[j ]); ELocal[4] = (imgPad1[xLeft] + imgPad2[xRight]); ELocal[7] = (imgPad[xRight] + imgPad[xLeft]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 2); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 2); ELocal[6] = (imgPad[xRight] + imgPad[xLeft]); xLeft = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j - 3); xRight = XAVS2_CLIP3(xPos + xOffSetLeft, xPosEnd - 1 + xOffSetRight, j + 3); ELocal[5] = (imgPad[xRight] + imgPad[xLeft]); ELocal[8] = (imgPad[j ]); yLocal = (int)imgOrg[j]; for (k = 0; k < N; k++) { m_autoCorr[k][k] += ELocal[k] * ELocal[k]; for (l = k + 1; l < N; l++) { m_autoCorr[k][l] += ELocal[k] * ELocal[l]; } m_crossCorr[k] += yLocal * ELocal[k]; } } imgPad += i_rec; imgOrg += i_org; } for (j = 0; j < N - 1; j++) { for (i = j + 1; i < N; i++) { m_autoCorr[i][j] = m_autoCorr[j][i]; } } } /* --------------------------------------------------------------------------- */ static void reset_alfCorr(AlfCorrData *alfCorr, int componentID) { int numCoef = ALF_MAX_NUM_COEF; int maxNumGroups = NO_VAR_BINS; int g, j, i; int numGroups = (componentID == IMG_Y) ? (maxNumGroups) : (1); for (g = 0; g < numGroups; g++) { alfCorr->pixAcc[g] = 0; for (j = 0; j < numCoef; j++) { alfCorr->m_crossCorr[g][j] = 0; for (i = 0; i < numCoef; i++) { alfCorr->m_autoCorr[g][j][i] = 0; } } } } /* --------------------------------------------------------------------------- */ static void deriveBoundaryAvail(xavs2_t *h, int pic_x, int pic_y, int *isLeftAvail, int *isRightAvail, int *isAboveAvail, int *isBelowAvail) { int size_lcu = 1 << h->i_lcu_level; int mb_x, mb_y, mb_nr; int pic_mb_width = h->i_width_in_mincu; cu_info_t *cuCurr, *cuLeft, *cuRight, *cuAbove, *cuBelow; mb_x = pic_x >> MIN_CU_SIZE_IN_BIT; mb_y = pic_y >> MIN_CU_SIZE_IN_BIT; mb_nr = mb_y * pic_mb_width + mb_x; *isLeftAvail = pic_x > 0; *isRightAvail = pic_x + size_lcu < h->i_width; *isAboveAvail = pic_y > 0; *isBelowAvail = pic_y + size_lcu < h->i_height; cuCurr = &(h->cu_info[mb_nr]); cuLeft = *isLeftAvail ? &(h->cu_info[mb_nr - 1]) : NULL; cuRight = *isRightAvail ? 
&(h->cu_info[mb_nr + 1]) : NULL; cuAbove = *isAboveAvail ? &(h->cu_info[mb_nr - pic_mb_width]) : NULL; cuBelow = *isBelowAvail ? &(h->cu_info[mb_nr + pic_mb_width]) : NULL; if (!h->param->b_cross_slice_loop_filter) { int curSliceNr = cu_get_slice_index(h, mb_x, mb_y); if (*isLeftAvail) { *isLeftAvail = cu_get_slice_index(h, mb_x - 1, mb_y) == curSliceNr; } if (*isRightAvail) { *isRightAvail = cu_get_slice_index(h, mb_x + (size_lcu >> MIN_CU_SIZE_IN_BIT), mb_y) == curSliceNr; } if (*isAboveAvail) { *isAboveAvail = cu_get_slice_index(h, mb_x, mb_y - 1) == curSliceNr; } } } /* --------------------------------------------------------------------------- * Function: Calculate the correlation matrix for each LCU * Input: * h : handler of encoder * (lcu_x, lcu_y) : The LCU position * p_org : The original image * p_rec : The reconstruction image before ALF * Output: * Return: * --------------------------------------------------------------------------- */ void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y, xavs2_frame_t *p_org, xavs2_frame_t *p_rec) { alf_ctx_t *Enc_ALF = (alf_ctx_t *)h->enc_alf; int ctu = lcu_y * h->i_width_in_lcu + lcu_x; int ctuYPos = lcu_y << h->i_lcu_level; int ctuXPos = lcu_x << h->i_lcu_level; int size_lcu = 1 << h->i_lcu_level; int ctuHeight = XAVS2_MIN(size_lcu, h->i_height - ctuYPos); int ctuWidth = XAVS2_MIN(size_lcu, h->i_width - ctuXPos); int formatShift; int compIdx = IMG_U; AlfCorrData *alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; int isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail; deriveBoundaryAvail(h, ctuXPos, ctuYPos, &isLeftAvail, &isRightAvail, &isAboveAvail, &isBelowAvail); reset_alfCorr(alfCorr, compIdx); formatShift = 1; calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx], p_rec->planes[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); compIdx = IMG_V; alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; reset_alfCorr(alfCorr, compIdx); /* V component: the ypos, xpos, height and width arguments are the same as for the U component, no changes needed */ calcCorrOneCompRegionChma(h, p_org->planes[compIdx], p_org->i_stride[compIdx], p_rec->planes[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); compIdx = IMG_Y; alfCorr = &Enc_ALF->m_alfCorr[compIdx][ctu]; reset_alfCorr(alfCorr, compIdx); formatShift = 0; calcCorrOneCompRegionLuma(h, Enc_ALF, p_org->planes[compIdx], p_org->i_stride[compIdx], p_rec->planes[compIdx], p_rec->i_stride[compIdx], ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, alfCorr->m_autoCorr, alfCorr->m_crossCorr, alfCorr->pixAcc, isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail); } /** * --------------------------------------------------------------------------- * Function: correlation matrix merge * Input: * src: input correlation matrix * mergeTable: merge table * Output: * dst: output correlation matrix * Return: * --------------------------------------------------------------------------- */ static void mergeFrom(AlfCorrData *dst, AlfCorrData *src, int *mergeTable, int doPixAccMerge, int componentID) { int numCoef = ALF_MAX_NUM_COEF; int64_t (*srcE)[ALF_MAX_NUM_COEF], (*dstE)[ALF_MAX_NUM_COEF]; double *srcy, *dsty; int maxFilterSetSize, j, i, varInd, filtIdx; 
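    /* Descriptive note: mergeFrom() accumulates the per-LCU ALF statistics into merged filter
     * groups. For luma, variance bin varInd is added into group mergeTable[varInd] (group 0
     * when mergeTable is NULL); chroma uses a single group. The merged auto-correlation
     * matrix E and cross-correlation vector y are the data on which the Wiener normal
     * equations E * c = y are later solved (see gnsSolveByChol()). */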
//assert(dst->componentID == src->componentID); reset_alfCorr(dst, componentID); switch (componentID) { case IMG_U: case IMG_V: srcE = src->m_autoCorr[0]; dstE = dst->m_autoCorr[0]; srcy = src->m_crossCorr[0]; dsty = dst->m_crossCorr[0]; for (j = 0; j < numCoef; j++) { for (i = 0; i < numCoef; i++) { dstE[j][i] += srcE[j][i]; } dsty[j] += srcy[j]; } if (doPixAccMerge) { dst->pixAcc[0] = src->pixAcc[0]; } break; case IMG_Y: maxFilterSetSize = (int)NO_VAR_BINS; for (varInd = 0; varInd < maxFilterSetSize; varInd++) { filtIdx = (mergeTable == NULL) ? (0) : (mergeTable[varInd]); srcE = src->m_autoCorr[varInd]; dstE = dst->m_autoCorr[filtIdx]; srcy = src->m_crossCorr[varInd]; dsty = dst->m_crossCorr[filtIdx]; for (j = 0; j < numCoef; j++) { for (i = 0; i < numCoef; i++) { dstE[j][i] += srcE[j][i]; } dsty[j] += srcy[j]; } if (doPixAccMerge) { dst->pixAcc[filtIdx] += src->pixAcc[varInd]; } } break; default: printf("not a legal component ID\n"); assert(0); exit(-1); } } /* --------------------------------------------------------------------------- */ static uint32_t ALFParamBitrateEstimate(ALFParam *alfParam) { uint32_t bitrate = 0; //alf enabled flag int g, i; for (g = 0; g < alfParam->filters_per_group; g++) { for (i = 0; i < (int)ALF_MAX_NUM_COEF; i++) { bitrate += svlc_bitrate_estimate[64 + alfParam->coeffmulti[g][i]]; } } return bitrate; } /* --------------------------------------------------------------------------- */ static uint32_t estimateALFBitrateInPicHeader(ALFParam *alfPicParam) { //CXCTBD please help to check if the implementation is consistent with syntax coding uint32_t bitrate = 3; // pic_alf_enabled_flag[0,1,2] if (alfPicParam[0].alf_flag) { int noFilters = alfPicParam[0].filters_per_group - 1; bitrate += uvlc_bitrate_estimate[noFilters] + (4 * noFilters); bitrate += ALFParamBitrateEstimate(&alfPicParam[0]); } if (alfPicParam[1].alf_flag) { bitrate += ALFParamBitrateEstimate(&alfPicParam[1]); } if (alfPicParam[2].alf_flag) { bitrate += ALFParamBitrateEstimate(&alfPicParam[2]); } return bitrate; } /* --------------------------------------------------------------------------- */ static long xFastFiltDistEstimation(alf_ctx_t *Enc_ALF, int64_t ppdE[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pdy, int *piCoeff, int iFiltLength) { //static memory double pdcoeff[ALF_MAX_NUM_COEF]; //variable int i, j; long iDist; double dDist = 0; uint32_t uiShift; for (i = 0; i < iFiltLength; i++) { pdcoeff[i] = (double)piCoeff[i] / (double)(1 << ((int)ALF_NUM_BIT_SHIFT)); } for (i = 0, dDist = 0; i < iFiltLength; i++) { double dsum = ((double)ppdE[i][i]) * pdcoeff[i]; for (j = i + 1; j < iFiltLength; j++) { dsum += (double)(2 * ppdE[i][j]) * pdcoeff[j]; } dDist += ((dsum - 2.0 * pdy[i]) * pdcoeff[i]); } uiShift = Enc_ALF->m_uiBitIncrement << 1; if (dDist < 0) { iDist = -(((long)(-dDist + 0.5)) >> uiShift); } else { //dDist >=0 iDist = ((long)(dDist + 0.5)) >> uiShift; } return iDist; } /* --------------------------------------------------------------------------- */ static long estimateFilterDistortion(alf_ctx_t *Enc_ALF, int compIdx, AlfCorrData *alfCorr, int coeffSet[][ALF_MAX_NUM_COEF], int filterSetSize, int *mergeTable, int doPixAccMerge) { AlfCorrData *alfMerged = &Enc_ALF->m_alfCorrMerged[compIdx]; int f; long iDist = 0; mergeFrom(alfMerged, alfCorr, mergeTable, doPixAccMerge, compIdx); if (coeffSet == NULL) { coeffSet = Enc_ALF->m_coeffNoFilter; } for (f = 0; f < filterSetSize; f++) { iDist += xFastFiltDistEstimation(Enc_ALF, alfMerged->m_autoCorr[f], alfMerged->m_crossCorr[f], coeffSet[f], 
ALF_MAX_NUM_COEF); } return iDist; } /* --------------------------------------------------------------------------- */ static dist_t calcAlfLCUDist(xavs2_t *h, alf_ctx_t *Enc_ALF, int compIdx, int ypos, int xpos, int height, int width, int isAboveAvail, pel_t *picSrc, int i_src, pel_t *picCmp, int i_cmp) { dist_t dist = 0; pel_t *pelCmp = picCmp; pel_t *pelSrc = picSrc; int notSkipLinesRightVB = TRUE; int notSkipLinesBelowVB = TRUE; int NumCUsInFrame, numLCUInPicWidth, numLCUInPicHeight; numLCUInPicHeight = h->i_height_in_lcu; numLCUInPicWidth = h->i_width_in_lcu; NumCUsInFrame = numLCUInPicHeight * numLCUInPicWidth; switch (compIdx) { case IMG_U: case IMG_V: if (!notSkipLinesBelowVB) { height = height - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1); } if (!notSkipLinesRightVB) { width = width - (int)(DF_CHANGED_SIZE >> 1) - (int)(ALF_FOOTPRINT_SIZE >> 1); } if (isAboveAvail) { pelSrc += ((ypos - 4) * i_src) + xpos; pelCmp += ((ypos - 4) * i_cmp) + xpos; } else { pelSrc += (ypos * i_src) + xpos; pelCmp += (ypos * i_cmp) + xpos; } break; default: // case IMG_Y: if (!notSkipLinesBelowVB) { height = height - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1); } if (!notSkipLinesRightVB) { width = width - (int)(DF_CHANGED_SIZE)-(int)(ALF_FOOTPRINT_SIZE >> 1); } pelCmp = picCmp + (ypos * i_cmp) + xpos; pelSrc = picSrc + (ypos * i_src) + xpos; break; } if (PART_INDEX(width, height) == LUMA_INVALID) { uint32_t uiShift = Enc_ALF->m_uiBitIncrement << 1; dist += g_funcs.pixf.ssd_block(pelSrc, i_src, pelCmp, i_cmp, width, height) >> uiShift; } else { dist += g_funcs.pixf.ssd[PART_INDEX(width, height)](pelSrc, i_src, pelCmp, i_cmp); } return dist; } /* --------------------------------------------------------------------------- * ALF filter on CTB */ static void filterOneCTB(xavs2_t *h, alf_ctx_t *Enc_ALF, pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int compIdx, ALFParam *alfParam, int ypos, int height, int xpos, int width, int isAboveAvail, int isBelowAvail) { int *coef; //reconstruct coefficients to m_filterCoeffSym and m_varIndTab reconstructCoefInfo(compIdx, alfParam, Enc_ALF->m_filterCoeffSym, Enc_ALF->m_varIndTab); //reconstruct ALF coefficients & related parameters //derive CTB start positions, width, and height. If the boundary is not available, skip boundary samples. if (compIdx == IMG_Y) { int var = Enc_ALF->tab_lcu_region[(ypos >> h->i_lcu_level) * h->i_width_in_lcu + (xpos >> h->i_lcu_level)]; coef = Enc_ALF->m_filterCoeffSym[Enc_ALF->m_varIndTab[var]]; } else { coef = Enc_ALF->m_filterCoeffSym[0]; } g_funcs.alf_flt[0](p_dst, i_dst, p_src, i_src, xpos, ypos, width, height, coef, isAboveAvail, isBelowAvail); g_funcs.alf_flt[1](p_dst, i_dst, p_src, i_src, xpos, ypos, width, height, coef, isAboveAvail, isBelowAvail); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void copyOneAlfBlk(pel_t *p_dst, int i_dst, pel_t *p_src, int i_src, int ypos, int xpos, int height, int width, int isAboveAvail, int isBelowAvail) { int startPos = isAboveAvail ? (ypos - 4) : ypos; int endPos = isBelowAvail ? 
(ypos + height - 4) : ypos + height; p_dst += (startPos * i_dst) + xpos; p_src += (startPos * i_src) + xpos; g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, width, endPos - startPos); } /* --------------------------------------------------------------------------- * ALF On/off decision for LCU and do RDO Estimation */ static double executePicLCUOnOffDecisionRDOEstimate(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, ALFParam *alfPictureParam, double lambda, AlfCorrData * alfCorr) { dist_t distEnc, distOff; double rateEnc, rateOff, costEnc, costOff, costAlfOn, costAlfOff; dist_t distBestPic[IMG_CMPNTS]; double rateBestPic[IMG_CMPNTS]; int compIdx, ctu; double lambda_luma, lambda_chroma; int img_height, img_width; int NumCUsInFrame; double bestCost = 0; int rate, noFilters; h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_initial); h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); img_height = h->i_height; img_width = h->i_width; NumCUsInFrame = h->i_height_in_lcu * h->i_width_in_lcu; lambda_luma = lambda; //VKTBD lambda is not correct lambda_chroma = LAMBDA_SCALE_CHROMA * lambda_luma; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { distBestPic[compIdx] = 0; rateBestPic[compIdx] = 0; } for (ctu = 0; ctu < NumCUsInFrame; ctu++) { for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { //if slice-level enabled flag is 0, set CTB-level enabled flag 0 if (alfPictureParam[compIdx].alf_flag == 0) { h->is_alf_lcu_on[ctu][compIdx] = FALSE; continue; } // ALF on reconstructCoefInfo(compIdx, &alfPictureParam[compIdx], Enc_ALF->m_filterCoeffSym, Enc_ALF->m_varIndTab); //distEnc is the estimated distortion reduction compared with filter-off case distEnc = estimateFilterDistortion(Enc_ALF, compIdx, alfCorr + (compIdx * NumCUsInFrame) + ctu, Enc_ALF->m_filterCoeffSym, alfPictureParam[compIdx].filters_per_group, Enc_ALF->m_varIndTab, FALSE) - estimateFilterDistortion(Enc_ALF, compIdx, alfCorr + (compIdx * NumCUsInFrame) + ctu, NULL, 1, NULL, FALSE); h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateEnc = p_aec->binary.write_alf_lcu_ctrl(p_aec, 1); costEnc = (double)distEnc + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateEnc; // ALF off distOff = 0; // rateOff = 1; h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, 0); costOff = (double)distOff + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateOff; //set CTB-level on/off flag h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE; //update CABAC status //cabacCoder->updateAlfCtrlFlagState(m_pcPic->getCU(ctu)->getAlfLCUEnabled(compIdx)?1:0); h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, (h->is_alf_lcu_on[ctu][compIdx] ? 1 : 0)); h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); rateBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? rateEnc : rateOff); distBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? distEnc : distOff); } //CTB } //CTU for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { if (alfPictureParam[compIdx].alf_flag == 1) { double lambda = (compIdx == 0 ? 
lambda_luma : lambda_chroma); rate = ALFParamBitrateEstimate(&alfPictureParam[compIdx]); if (compIdx == IMG_Y) { noFilters = alfPictureParam[0].filters_per_group - 1; rate += uvlc_bitrate_estimate[noFilters] + (4 * noFilters); } costAlfOn = (double)distBestPic[compIdx] + lambda * (rateBestPic[compIdx] + (double)(rate)); costAlfOff = 0; if (costAlfOn >= costAlfOff) { alfPictureParam[compIdx].alf_flag = 0; for (ctu = 0; ctu < NumCUsInFrame; ctu++) { h->is_alf_lcu_on[ctu][compIdx] = FALSE; } } } } bestCost = 0; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { if (alfPictureParam[compIdx].alf_flag == 1) { bestCost += (double)distBestPic[compIdx] + (compIdx == 0 ? lambda_luma : lambda_chroma) * (rateBestPic[compIdx]); } } // return the block-level RD cost return bestCost; } /* --------------------------------------------------------------------------- * ALF On/Off decision for LCU */ static void executePicLCUOnOffDecision(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, ALFParam *alfPictureParam, double lambda, xavs2_frame_t *p_org, xavs2_frame_t *p_rec, xavs2_frame_t *p_dst) { dist_t distEnc, distOff; double rateEnc, rateOff, costEnc, costOff, costAlfOn, costAlfOff; int isLeftAvail, isRightAvail, isAboveAvail, isBelowAvail; dist_t distBestPic[IMG_CMPNTS]; double rateBestPic[IMG_CMPNTS]; int compIdx, ctu, ctuYPos, ctuXPos, ctuHeight, ctuWidth; int formatShift = 0; int i_org = 0; int i_rec_before = 0; int i_rec_after = 0; pel_t *p_org_pixel = NULL; pel_t *p_rec_before = NULL; pel_t *p_rec_after = NULL; double lambda_luma, lambda_chroma; int img_height, img_width; int size_lcu = 1 << h->i_lcu_level; int ctux, ctuy; int NumCUsInFrame, numLCUInPicWidth, numLCUInPicHeight; int rate, noFilters; h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_initial); h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); img_height = h->i_height; img_width = h->i_width; numLCUInPicHeight = h->i_height_in_lcu; numLCUInPicWidth = h->i_width_in_lcu; NumCUsInFrame = numLCUInPicHeight * numLCUInPicWidth; lambda_luma = lambda; //VKTBD lambda is not correct lambda_chroma = LAMBDA_SCALE_CHROMA * lambda_luma; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { distBestPic[compIdx] = 0; rateBestPic[compIdx] = 0; } for (ctuy = 0, ctu = 0; ctuy < numLCUInPicHeight; ctuy++) { //derive CTU height ctuYPos = ctuy * size_lcu; ctuHeight = XAVS2_MIN(img_height - ctuYPos, size_lcu); for (ctux = 0; ctux < numLCUInPicWidth; ctux++, ctu++) { //derive CTU width ctuXPos = ctux * size_lcu; ctuWidth = XAVS2_MIN(img_width - ctuXPos, size_lcu); //derive CTU boundary availabilities deriveBoundaryAvail(h, ctuXPos, ctuYPos, &isLeftAvail, &isRightAvail, &isAboveAvail, &isBelowAvail); for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { //if slice-level enabled flag is 0, set CTB-level enabled flag 0 if (alfPictureParam[compIdx].alf_flag == 0) { h->is_alf_lcu_on[ctu][compIdx] = FALSE; continue; } formatShift = (compIdx == IMG_Y) ? 
0 : 1; p_org_pixel = p_org->planes[compIdx]; i_org = p_org->i_stride[compIdx]; p_rec_before = p_rec->planes[compIdx]; i_rec_before = p_rec->i_stride[compIdx]; p_rec_after = p_dst->planes[compIdx]; i_rec_after = p_dst->i_stride[compIdx]; // ALF on filterOneCTB(h, Enc_ALF, p_rec_after, i_rec_after, p_rec_before, i_rec_before, compIdx, &alfPictureParam[compIdx], ctuYPos >> formatShift, ctuHeight >> formatShift, ctuXPos >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail); distEnc = calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_after, i_rec_after); distEnc -= calcAlfLCUDist(h, Enc_ALF, compIdx, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, p_org_pixel, i_org, p_rec_before, i_rec_before); h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateEnc = p_aec->binary.write_alf_lcu_ctrl(p_aec, 1); costEnc = (double)distEnc + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateEnc; // ALF off distOff = 0; //rateOff = 1; h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, 0); costOff = (double)distOff + (compIdx == 0 ? lambda_luma : lambda_chroma) * rateOff; //set CTB-level on/off flag h->is_alf_lcu_on[ctu][compIdx] = (costEnc < costOff) ? TRUE : FALSE; if (!h->is_alf_lcu_on[ctu][compIdx]) { copyOneAlfBlk(p_rec_after, i_rec_after, p_rec_before, i_rec_before, ctuYPos >> formatShift, ctuXPos >> formatShift, ctuHeight >> formatShift, ctuWidth >> formatShift, isAboveAvail, isBelowAvail); } //update CABAC status //cabacCoder->updateAlfCtrlFlagState(m_pcPic->getCU(ctu)->getAlfLCUEnabled(compIdx)?1:0); h->copy_aec_state_rdo(p_aec, &h->cs_data.cs_alf_cu_ctr); rateOff = p_aec->binary.write_alf_lcu_ctrl(p_aec, (h->is_alf_lcu_on[ctu][compIdx] ? 1 : 0)); h->copy_aec_state_rdo(&h->cs_data.cs_alf_cu_ctr, p_aec); rateBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? rateEnc : rateOff); distBestPic[compIdx] += (h->is_alf_lcu_on[ctu][compIdx] ? distEnc : distOff); } //CTB } } //CTU for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { if (alfPictureParam[compIdx].alf_flag == 1) { double lambda = (compIdx == 0 ? lambda_luma : lambda_chroma); rate = ALFParamBitrateEstimate(&alfPictureParam[compIdx]); if (compIdx == IMG_Y) { noFilters = alfPictureParam[0].filters_per_group - 1; rate += uvlc_bitrate_estimate[noFilters] + (4 * noFilters); } costAlfOn = (double)distBestPic[compIdx] + lambda * (rateBestPic[compIdx] + (double)(rate)); costAlfOff = 0; if (costAlfOn >= costAlfOff) { alfPictureParam[compIdx].alf_flag = 0; for (ctu = 0; ctu < NumCUsInFrame; ctu++) { h->is_alf_lcu_on[ctu][compIdx] = FALSE; } g_funcs.plane_copy(p_dst->planes[compIdx], p_dst->i_stride[compIdx], p_rec->planes[compIdx], p_rec->i_stride[compIdx], p_rec->i_width[compIdx], p_rec->i_lines[compIdx]); } } } } /* --------------------------------------------------------------------------- */ static void ADD_AlfCorrData(AlfCorrData *A, AlfCorrData *B, AlfCorrData *C, int componentID) { int numCoef = ALF_MAX_NUM_COEF; int maxNumGroups = NO_VAR_BINS; int numGroups; int g, j, i; numGroups = (componentID == IMG_Y) ? 
(maxNumGroups) : (1); for (g = 0; g < numGroups; g++) { C->pixAcc[g] = A->pixAcc[g] + B->pixAcc[g]; for (j = 0; j < numCoef; j++) { C->m_crossCorr[g][j] = A->m_crossCorr[g][j] + B->m_crossCorr[g][j]; for (i = 0; i < numCoef; i++) { C->m_autoCorr[g][j][i] = A->m_autoCorr[g][j][i] + B->m_autoCorr[g][j][i]; } } } } /* --------------------------------------------------------------------------- */ static void accumulateLCUCorrelations(xavs2_t *h, AlfCorrData **alfCorrAcc, AlfCorrData ** alfCorSrcLCU, int useAllLCUs) { int compIdx, numStatLCU, addr; AlfCorrData *alfCorrAccComp; int NumCUsInFrame = h->i_width_in_lcu * h->i_height_in_lcu; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { alfCorrAccComp = alfCorrAcc[compIdx]; reset_alfCorr(alfCorrAccComp, compIdx); if (!useAllLCUs) { numStatLCU = 0; for (addr = 0; addr < NumCUsInFrame; addr++) { if (h->is_alf_lcu_on[addr][compIdx]) { numStatLCU++; break; } } useAllLCUs = (numStatLCU == 0) ? TRUE : useAllLCUs; } for (addr = 0; addr < (int)NumCUsInFrame; addr++) { if (useAllLCUs || h->is_alf_lcu_on[addr][compIdx]) { //*alfCorrAccComp += *(alfCorSrcLCU[compIdx][addr]); ADD_AlfCorrData(&alfCorSrcLCU[compIdx][addr], alfCorrAccComp, alfCorrAccComp, compIdx); } } } } /* --------------------------------------------------------------------------- */ static void predictALFCoeff(int coeff[][ALF_MAX_NUM_COEF], int numCoef, int numFilters) { int g, pred, sum, i; for (g = 0; g < numFilters; g++) { for (i = 0, sum = 0; i < numCoef - 1; i++) { sum += (2 * coeff[g][i]); } pred = (1 << ALF_NUM_BIT_SHIFT) - (sum); coeff[g][numCoef - 1] = coeff[g][numCoef - 1] - pred; } } /* --------------------------------------------------------------------------- */ static void xcodeFiltCoeff(int filterCoeff[][ALF_MAX_NUM_COEF], int *varIndTab, int numFilters, ALFParam *alfParam) { int filterPattern[NO_VAR_BINS], startSecondFilter = 0, i, g; memset(filterPattern, 0, NO_VAR_BINS * sizeof(int)); alfParam->num_coeff = (int)ALF_MAX_NUM_COEF; alfParam->filters_per_group = numFilters; //merge table assignment if (alfParam->filters_per_group > 1) { for (i = 1; i < NO_VAR_BINS; ++i) { if (varIndTab[i] != varIndTab[i - 1]) { filterPattern[i] = 1; startSecondFilter = i; } } } memcpy(alfParam->filterPattern, filterPattern, NO_VAR_BINS * sizeof(int)); //coefficient prediction for (g = 0; g < alfParam->filters_per_group; g++) { for (i = 0; i < alfParam->num_coeff; i++) { alfParam->coeffmulti[g][i] = filterCoeff[g][i]; } } predictALFCoeff(alfParam->coeffmulti, alfParam->num_coeff, alfParam->filters_per_group); } /* --------------------------------------------------------------------------- */ static void gnsTransposeBacksubstitution(double U[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double rhs[], double x[], int order) { int i, j; // Looping variables double sum; // Holds backsubstitution from already handled rows // Backsubstitution starts x[0] = rhs[0] / U[0][0]; // first row of U for (i = 1; i < order; i++) { // for the rows 1..order-1 for (j = 0, sum = 0.0; j < i; j++) { // Backsubst already solved unknowns sum += x[j] * U[j][i]; } x[i] = (rhs[i] - sum) / U[i][i]; // i'th component of solution vect } } /* --------------------------------------------------------------------------- */ static void gnsBacksubstitution(double R[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double z[ALF_MAX_NUM_COEF], int R_size, double A[ALF_MAX_NUM_COEF]) { int i, j; double sum; R_size--; A[R_size] = z[R_size] / R[R_size][R_size]; for (i = R_size - 1; i >= 0; i--) { for (j = i + 1, sum = 0.0; j <= R_size; j++) { sum += 
R[i][j] * A[j]; } A[i] = (z[i] - sum) / R[i][i]; } } /* --------------------------------------------------------------------------- */ static int gnsCholeskyDec(int64_t inpMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double outMatr[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int noEq) { int i, j, k; /* Looping Variables */ double scale; /* scaling factor for each row */ double invDiag[ALF_MAX_NUM_COEF]; /* Vector of the inverse of diagonal entries of outMatr */ // Cholesky decomposition starts for (i = 0; i < noEq; i++) { for (j = i; j < noEq; j++) { /* Compute the scaling factor */ scale = (double)inpMatr[i][j]; if (i > 0) { for (k = i - 1; k >= 0; k--) { scale -= outMatr[k][j] * outMatr[k][i]; } } /* Compute i'th row of outMatr */ if (i == j) { if (scale <= REG_SQR) { // if(scale <= 0 ) /* If inpMatr is singular */ return 0; } else { /* Normal operation */ invDiag[i] = 1.0 / (outMatr[i][i] = sqrt(scale)); } } else { outMatr[i][j] = scale * invDiag[i]; /* Upper triangular part */ outMatr[j][i] = 0.0; /* Lower triangular part set to 0 */ } } } return 1; /* Signal that Cholesky factorization is successfully performed */ } /* --------------------------------------------------------------------------- */ static int gnsSolveByChol(int64_t LHS[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *rhs, double *x, int noEq) { double aux[ALF_MAX_NUM_COEF]; /* Auxiliary vector */ double U[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF]; /* Upper triangular Cholesky factor of LHS */ int i, singular; /* Looping variable */ assert(noEq > 0); /* The equation to be solved is LHSx = rhs */ /* Compute upper triangular U such that U'*U = LHS */ if (gnsCholeskyDec(LHS, U, noEq)) { /* If Cholesky decomposition has been successful */ singular = 1; /* Now, the equation is U'*U*x = rhs, where U is upper triangular * Solve U'*aux = rhs for aux */ gnsTransposeBacksubstitution(U, rhs, aux, noEq); /* The equation is now U*x = aux, solve it for x (new motion coefficients) */ gnsBacksubstitution(U, aux, noEq, x); } else { /* LHS was singular */ singular = 0; /* Regularize LHS for (i = 0; i < noEq; i++) { LHS[i][i] += REG; }*/ /* Compute upper triangular U such that U'*U = regularized LHS */ singular = gnsCholeskyDec(LHS, U, noEq); if (singular == 1) { /* Solve U'*aux = rhs for aux */ gnsTransposeBacksubstitution(U, rhs, aux, noEq); /* Solve U*x = aux for x */ gnsBacksubstitution(U, aux, noEq, x); } else { x[0] = 1.0; for (i = 1; i < noEq; i++) { x[i] = 0.0; } } } return singular; } /* --------------------------------------------------------------------------- */ static double calculateErrorAbs(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double y, int size) { int i; double error, sum; double c[ALF_MAX_NUM_COEF]; gnsSolveByChol(A, b, c, size); sum = 0; for (i = 0; i < size; i++) { sum += c[i] * b[i]; } error = y - sum; return error; } /* --------------------------------------------------------------------------- */ static double mergeFiltersGreedy(alf_ctx_t *Enc_ALF, double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAccGlobalSeq, int intervalBest[NO_VAR_BINS][2], int sqrFiltLength, int noIntervals) { int first, ind, ind1, ind2, i, j, bestToMerge; double error, error1, error2, errorMin; static double pixAcc_temp, error_tab[NO_VAR_BINS], error_comb_tab[NO_VAR_BINS]; static int indexList[NO_VAR_BINS], available[NO_VAR_BINS], noRemaining; if (noIntervals == NO_VAR_BINS) { noRemaining = NO_VAR_BINS; for (ind = 0; ind < NO_VAR_BINS; ind++) { indexList[ind] = ind; 
available[ind] = 1; Enc_ALF->m_pixAcc_merged[ind] = pixAccGlobalSeq[ind]; memcpy(Enc_ALF->m_cross_merged[ind], yGlobalSeq[ind], sizeof(double)*sqrFiltLength); for (i = 0; i < sqrFiltLength; i++) { memcpy(Enc_ALF->m_auto_merged[ind][i], EGlobalSeq[ind][i], sizeof(int64_t)*sqrFiltLength); } } // try merging different matrices for (ind = 0; ind < NO_VAR_BINS; ind++) { error_tab[ind] = calculateErrorAbs(Enc_ALF->m_auto_merged[ind], Enc_ALF->m_cross_merged[ind], Enc_ALF->m_pixAcc_merged[ind], sqrFiltLength); } for (ind = 0; ind < NO_VAR_BINS - 1; ind++) { ind1 = indexList[ind]; ind2 = indexList[ind + 1]; error1 = error_tab[ind1]; error2 = error_tab[ind2]; pixAcc_temp = Enc_ALF->m_pixAcc_merged[ind1] + Enc_ALF->m_pixAcc_merged[ind2]; for (i = 0; i < sqrFiltLength; i++) { Enc_ALF->m_cross_temp[i] = Enc_ALF->m_cross_merged[ind1][i] + Enc_ALF->m_cross_merged[ind2][i]; for (j = 0; j < sqrFiltLength; j++) { Enc_ALF->m_auto_temp[i][j] = Enc_ALF->m_auto_merged[ind1][i][j] + Enc_ALF->m_auto_merged[ind2][i][j]; } } error_comb_tab[ind1] = calculateErrorAbs(Enc_ALF->m_auto_temp, Enc_ALF->m_cross_temp, pixAcc_temp, sqrFiltLength) - error1 - error2; } } while (noRemaining > noIntervals) { errorMin = 0; first = 1; bestToMerge = 0; for (ind = 0; ind < noRemaining - 1; ind++) { error = error_comb_tab[indexList[ind]]; if ((error < errorMin || first == 1)) { errorMin = error; bestToMerge = ind; first = 0; } } ind1 = indexList[bestToMerge]; ind2 = indexList[bestToMerge + 1]; Enc_ALF->m_pixAcc_merged[ind1] += Enc_ALF->m_pixAcc_merged[ind2]; for (i = 0; i < sqrFiltLength; i++) { Enc_ALF->m_cross_merged[ind1][i] += Enc_ALF->m_cross_merged[ind2][i]; for (j = 0; j < sqrFiltLength; j++) { Enc_ALF->m_auto_merged[ind1][i][j] += Enc_ALF->m_auto_merged[ind2][i][j]; } } available[ind2] = 0; // update error tables error_tab[ind1] = error_comb_tab[ind1] + error_tab[ind1] + error_tab[ind2]; if (indexList[bestToMerge] > 0) { ind1 = indexList[bestToMerge - 1]; ind2 = indexList[bestToMerge]; error1 = error_tab[ind1]; error2 = error_tab[ind2]; pixAcc_temp = Enc_ALF->m_pixAcc_merged[ind1] + Enc_ALF->m_pixAcc_merged[ind2]; for (i = 0; i < sqrFiltLength; i++) { Enc_ALF->m_cross_temp[i] = Enc_ALF->m_cross_merged[ind1][i] + Enc_ALF->m_cross_merged[ind2][i]; for (j = 0; j < sqrFiltLength; j++) { Enc_ALF->m_auto_temp[i][j] = Enc_ALF->m_auto_merged[ind1][i][j] + Enc_ALF->m_auto_merged[ind2][i][j]; } } error_comb_tab[ind1] = calculateErrorAbs(Enc_ALF->m_auto_temp, Enc_ALF->m_cross_temp, pixAcc_temp, sqrFiltLength) - error1 - error2; } if (indexList[bestToMerge + 1] < NO_VAR_BINS - 1) { ind1 = indexList[bestToMerge]; ind2 = indexList[bestToMerge + 2]; error1 = error_tab[ind1]; error2 = error_tab[ind2]; pixAcc_temp = Enc_ALF->m_pixAcc_merged[ind1] + Enc_ALF->m_pixAcc_merged[ind2]; for (i = 0; i < sqrFiltLength; i++) { Enc_ALF->m_cross_temp[i] = Enc_ALF->m_cross_merged[ind1][i] + Enc_ALF->m_cross_merged[ind2][i]; for (j = 0; j < sqrFiltLength; j++) { Enc_ALF->m_auto_temp[i][j] = Enc_ALF->m_auto_merged[ind1][i][j] + Enc_ALF->m_auto_merged[ind2][i][j]; } } error_comb_tab[ind1] = calculateErrorAbs(Enc_ALF->m_auto_temp, Enc_ALF->m_cross_temp, pixAcc_temp, sqrFiltLength) - error1 - error2; } ind = 0; for (i = 0; i < NO_VAR_BINS; i++) { if (available[i] == 1) { indexList[ind] = i; ind++; } } noRemaining--; } errorMin = 0; for (ind = 0; ind < noIntervals; ind++) { errorMin += error_tab[indexList[ind]]; } for (ind = 0; ind < noIntervals - 1; ind++) { intervalBest[ind][0] = indexList[ind]; intervalBest[ind][1] = indexList[ind + 1] - 1; } 
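    /* Descriptive note: indexList[0..noIntervals-1] now holds the first variance bin of each
     * remaining merged group; intervalBest[f] records the [first, last] variance-bin range
     * covered by filter f, and the last group below is extended to cover the remaining bins
     * up to NO_VAR_BINS - 1. */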
intervalBest[noIntervals - 1][0] = indexList[noIntervals - 1]; intervalBest[noIntervals - 1][1] = NO_VAR_BINS - 1; return (errorMin); } /* --------------------------------------------------------------------------- */ static double xfindBestCoeffCodMethod(int filterCoeffSymQuant[][ALF_MAX_NUM_COEF], int sqrFiltLength, int filters_per_fr, double errorForce0CoeffTab[NO_VAR_BINS][2], double lambda) { int coeffBits, i; double error = 0, lagrangian; int coeffmulti[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int g; for (g = 0; g < filters_per_fr; g++) { for (i = 0; i < sqrFiltLength; i++) { coeffmulti[g][i] = filterCoeffSymQuant[g][i]; } } predictALFCoeff(coeffmulti, sqrFiltLength, filters_per_fr); coeffBits = 0; for (g = 0; g < filters_per_fr; g++) { for (i = 0; i < (int)ALF_MAX_NUM_COEF; i++) { coeffBits += svlc_bitrate_estimate[64 + coeffmulti[g][i]]; } error += errorForce0CoeffTab[g][1]; } lagrangian = error + lambda * coeffBits; return (lagrangian); } /* --------------------------------------------------------------------------- */ static void add_A(int64_t Amerged[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t A[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size) { int i, j, ind; for (i = 0; i < size; i++) { for (j = 0; j < size; j++) { Amerged[i][j] = 0; for (ind = start; ind <= stop; ind++) { Amerged[i][j] += A[ind][i][j]; } } } } /* --------------------------------------------------------------------------- */ static void add_b(double *bmerged, double b[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int start, int stop, int size) { int i, ind; for (i = 0; i < size; i++) { bmerged[i] = 0; for (ind = start; ind <= stop; ind++) { bmerged[i] += b[ind][i]; } } } /* --------------------------------------------------------------------------- */ static void roundFiltCoeff(int *FilterCoeffQuan, double *FilterCoeff, int sqrFiltLength, int factor) { int i, diffInt, sign; double diff; for (i = 0; i < sqrFiltLength; i++) { sign = (FilterCoeff[i] > 0) ? 1 : -1; diff = FilterCoeff[i] * sign; diffInt = (int)(diff * (double)factor + 0.5); FilterCoeffQuan[i] = diffInt * sign; } } /* --------------------------------------------------------------------------- */ static double calculateErrorCoeffProvided(int64_t A[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *b, double *c, int size) { int i, j; double error = 0, sum; for (i = 0; i < size; i++) { // diagonal for (sum = 0, j = i + 1; j < size; j++) { sum += (A[j][i] + A[i][j]) * c[j]; } error += (A[i][i] * c[i] + sum - 2 * b[i]) * c[i]; } return error; } /* --------------------------------------------------------------------------- */ static double QuantizeIntegerFilterPP(double *filterCoeff, int *filterCoeffQuant, int64_t E[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *y, int sqrFiltLength) { double error; int filterCoeffQuantMod[ALF_MAX_NUM_COEF]; int factor = (1 << ((int)ALF_NUM_BIT_SHIFT)); int i; int quantCoeffSum, minInd, targetCoeffSumInt, k, diff; double targetCoeffSum, errMin; const int *weights = tab_weightsShape1Sym; gnsSolveByChol(E, y, filterCoeff, sqrFiltLength); targetCoeffSum = 0; quantCoeffSum = 0; roundFiltCoeff(filterCoeffQuant, filterCoeff, sqrFiltLength, factor); for (i = 0; i < sqrFiltLength; i++) { targetCoeffSum += (weights[i] * filterCoeff[i] * factor); quantCoeffSum += weights[i] * filterCoeffQuant[i]; } targetCoeffSumInt = ROUND(targetCoeffSum); while (quantCoeffSum != targetCoeffSumInt) { diff = (quantCoeffSum - targetCoeffSumInt); diff = (diff < 0) ? 
(-diff) : diff; errMin = 0; minInd = -1; for (k = 0; k < sqrFiltLength; k++) { if (weights[k] <= diff) { for (i = 0; i < sqrFiltLength; i++) { filterCoeffQuantMod[i] = filterCoeffQuant[i]; } if (quantCoeffSum > targetCoeffSumInt) { filterCoeffQuantMod[k]--; } else { filterCoeffQuantMod[k]++; } for (i = 0; i < sqrFiltLength; i++) { filterCoeff[i] = (double)filterCoeffQuantMod[i] / (double)factor; } error = calculateErrorCoeffProvided(E, y, filterCoeff, sqrFiltLength); if (error < errMin || minInd == -1) { errMin = error; minInd = k; } } /* if (weights[k] <= diff) */ } /* for (k = 0; k < sqrFiltLength; k++) */ if (quantCoeffSum > targetCoeffSumInt) { filterCoeffQuant[minInd]--; } else { filterCoeffQuant[minInd]++; } quantCoeffSum = 0; for (i = 0; i < sqrFiltLength; i++) { quantCoeffSum += weights[i] * filterCoeffQuant[i]; } } checkFilterCoeffValue(filterCoeffQuant, sqrFiltLength); for (i = 0; i < sqrFiltLength; i++) { filterCoeff[i] = (double)filterCoeffQuant[i] / (double)factor; } error = calculateErrorCoeffProvided(E, y, filterCoeff, sqrFiltLength); return (error); } /* --------------------------------------------------------------------------- */ static double findFilterCoeff(alf_ctx_t *Enc_ALF, int64_t EGlobalSeq[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double yGlobalSeq[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAccGlobalSeq, int filterCoeffSeq[][ALF_MAX_NUM_COEF], int filterCoeffQuantSeq[][ALF_MAX_NUM_COEF], int intervalBest[NO_VAR_BINS][2], int varIndTab[NO_VAR_BINS], int sqrFiltLength, int filters_per_fr, double errorTabForce0Coeff[NO_VAR_BINS][2]) { double pixAcc_temp; int filterCoeffQuant[ALF_MAX_NUM_COEF]; double filterCoeff[ALF_MAX_NUM_COEF]; double error; int k, filtNo; error = 0; for (filtNo = 0; filtNo < filters_per_fr; filtNo++) { add_A(Enc_ALF->m_auto_temp, EGlobalSeq, intervalBest[filtNo][0], intervalBest[filtNo][1], sqrFiltLength); add_b(Enc_ALF->m_cross_temp, yGlobalSeq, intervalBest[filtNo][0], intervalBest[filtNo][1], sqrFiltLength); pixAcc_temp = 0; for (k = intervalBest[filtNo][0]; k <= intervalBest[filtNo][1]; k++) { pixAcc_temp += pixAccGlobalSeq[k]; varIndTab[k] = filtNo; } // find coefficients errorTabForce0Coeff[filtNo][1] = pixAcc_temp + QuantizeIntegerFilterPP(filterCoeff, filterCoeffQuant, Enc_ALF->m_auto_temp, Enc_ALF->m_cross_temp, sqrFiltLength); errorTabForce0Coeff[filtNo][0] = pixAcc_temp; error += errorTabForce0Coeff[filtNo][1]; for (k = 0; k < sqrFiltLength; k++) { filterCoeffSeq[filtNo][k] = filterCoeffQuant[k]; filterCoeffQuantSeq[filtNo][k] = filterCoeffQuant[k]; } } return (error); } /* --------------------------------------------------------------------------- */ static void xfindBestFilterVarPred(alf_ctx_t *Enc_ALF, double ySym[ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], int64_t ESym[][ALF_MAX_NUM_COEF][ALF_MAX_NUM_COEF], double *pixAcc, int filterCoeffSym[][ALF_MAX_NUM_COEF], int *filters_per_fr_best, int varIndTab[], double lambda_val, int numMaxFilters) { int filterCoeffSymQuant[NO_VAR_BINS][ALF_MAX_NUM_COEF]; int filters_per_fr, firstFilt, interval[NO_VAR_BINS][2], intervalBest[NO_VAR_BINS][2]; double lagrangian, lagrangianMin; int sqrFiltLength; double errorForce0CoeffTab[NO_VAR_BINS][2]; sqrFiltLength = (int)ALF_MAX_NUM_COEF; // zero all variables memset(varIndTab, 0, sizeof(int)*NO_VAR_BINS); memset(filterCoeffSym, 0, sizeof(int)*ALF_MAX_NUM_COEF * NO_VAR_BINS); memset(filterCoeffSymQuant, 0, sizeof(int)*ALF_MAX_NUM_COEF * NO_VAR_BINS); firstFilt = 1; lagrangianMin = 0; filters_per_fr = NO_VAR_BINS; while (filters_per_fr >= 1) { mergeFiltersGreedy(Enc_ALF, ySym, ESym, pixAcc, interval, 
sqrFiltLength, filters_per_fr); findFilterCoeff(Enc_ALF, ESym, ySym, pixAcc, filterCoeffSym, filterCoeffSymQuant, interval, varIndTab, sqrFiltLength, filters_per_fr, errorForce0CoeffTab); lagrangian = xfindBestCoeffCodMethod(filterCoeffSymQuant, sqrFiltLength, filters_per_fr, errorForce0CoeffTab, lambda_val); if (lagrangian < lagrangianMin || firstFilt == 1 || filters_per_fr == numMaxFilters) { firstFilt = 0; lagrangianMin = lagrangian; (*filters_per_fr_best) = filters_per_fr; memcpy(intervalBest, interval, NO_VAR_BINS * 2 * sizeof(int)); } filters_per_fr--; } findFilterCoeff(Enc_ALF, ESym, ySym, pixAcc, filterCoeffSym, filterCoeffSymQuant, intervalBest, varIndTab, sqrFiltLength, (*filters_per_fr_best), errorForce0CoeffTab); if (*filters_per_fr_best == 1) { memset(varIndTab, 0, sizeof(int)*NO_VAR_BINS); } } /* --------------------------------------------------------------------------- */ static int compare_coef(const void *value1, const void *value2) { DhNc *a = (DhNc*)value1; DhNc *b = (DhNc*)value2; double temp = (a->dh - b->dh); return temp > 0.0 ? 1 : (temp < 0.0 ? -1 : 0); } /* --------------------------------------------------------------------------- */ static void xQuantFilterCoef(double *hh, int *qh) { int i; const int N = (int)ALF_MAX_NUM_COEF; int max_value, min_value; double dbl_total_gain; int total_gain, q_total_gain; int upper, lower; DhNc dhnc[ALF_MAX_NUM_COEF]; const int *pFiltMag = tab_weightsShape1Sym; max_value = (1 << (1 + ALF_NUM_BIT_SHIFT)) - 1; min_value = 0 - (1 << (1 + ALF_NUM_BIT_SHIFT)); dbl_total_gain = 0.0; q_total_gain = 0; for (i = 0; i < N; i++) { if (hh[i] >= 0.0) { qh[i] = (int)(hh[i] * (1 << ALF_NUM_BIT_SHIFT) + 0.5); } else { qh[i] = -(int)(-hh[i] * (1 << ALF_NUM_BIT_SHIFT) + 0.5); } dhnc[i].dh = (double)qh[i] / (double)(1 << ALF_NUM_BIT_SHIFT) - hh[i]; dhnc[i].dh *= pFiltMag[i]; dbl_total_gain += hh[i] * pFiltMag[i]; q_total_gain += qh[i] * pFiltMag[i]; dhnc[i].nc = i; } // modification of quantized filter coefficients total_gain = (int)(dbl_total_gain * (1 << ALF_NUM_BIT_SHIFT) + 0.5); if (q_total_gain != total_gain) { qsort(dhnc, N, sizeof(struct dh_nc), compare_coef); if (q_total_gain > total_gain) { upper = N - 1; while (q_total_gain > total_gain + 1) { i = dhnc[upper % N].nc; qh[i]--; q_total_gain -= pFiltMag[i]; upper--; } if (q_total_gain == total_gain + 1) { if (dhnc[N - 1].dh > 0) { qh[N - 1]--; } else { i = dhnc[upper % N].nc; qh[i]--; qh[N - 1]++; } } } else if (q_total_gain < total_gain) { lower = 0; while (q_total_gain < total_gain - 1) { i = dhnc[lower % N].nc; qh[i]++; q_total_gain += pFiltMag[i]; lower++; } if (q_total_gain == total_gain - 1) { if (dhnc[N - 1].dh < 0) { qh[N - 1]++; } else { i = dhnc[lower % N].nc; qh[i]++; qh[N - 1]--; } } } } // set of filter coefficients for (i = 0; i < N; i++) { qh[i] = XAVS2_CLIP3(min_value, max_value, qh[i]); } checkFilterCoeffValue(qh, N); } /* --------------------------------------------------------------------------- */ static void deriveFilterInfo(alf_ctx_t *Enc_ALF, ALFParam *alfPictureParam, AlfCorrData **alfCorr_ptr, int maxNumFilters, double lambda) { int numCoeff = ALF_MAX_NUM_COEF; double coef[ALF_MAX_NUM_COEF]; int compIdx, lambdaForMerge, numFilters; compIdx = IMG_Y; AlfCorrData *alfCorr = alfCorr_ptr[compIdx]; ALFParam *alfFiltParam = &alfPictureParam[compIdx]; alfFiltParam->alf_flag = 1; lambdaForMerge = ((int)lambda) * (1 << (2 * Enc_ALF->m_uiBitIncrement)); memset(Enc_ALF->m_varIndTab, 0, sizeof(int)*NO_VAR_BINS); xfindBestFilterVarPred(Enc_ALF, alfCorr->m_crossCorr, 
alfCorr->m_autoCorr, alfCorr->pixAcc, Enc_ALF->m_filterCoeffSym, &numFilters, Enc_ALF->m_varIndTab, lambdaForMerge, maxNumFilters); xcodeFiltCoeff(Enc_ALF->m_filterCoeffSym, Enc_ALF->m_varIndTab, numFilters, alfFiltParam); compIdx = IMG_U; alfCorr = alfCorr_ptr[compIdx]; alfFiltParam = &alfPictureParam[compIdx]; alfFiltParam->alf_flag = 1; gnsSolveByChol(alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], coef, numCoeff); xQuantFilterCoef(coef, Enc_ALF->m_filterCoeffSym[0]); memcpy(alfFiltParam->coeffmulti[0], Enc_ALF->m_filterCoeffSym[0], sizeof(int)*numCoeff); predictALFCoeff(alfFiltParam->coeffmulti, numCoeff, alfFiltParam->filters_per_group); compIdx = IMG_V; alfCorr = alfCorr_ptr[compIdx]; alfFiltParam = &alfPictureParam[compIdx]; alfFiltParam->alf_flag = 1; gnsSolveByChol(alfCorr->m_autoCorr[0], alfCorr->m_crossCorr[0], coef, numCoeff); xQuantFilterCoef(coef, Enc_ALF->m_filterCoeffSym[0]); memcpy(alfFiltParam->coeffmulti[0], Enc_ALF->m_filterCoeffSym[0], sizeof(int)*numCoeff); predictALFCoeff(alfFiltParam->coeffmulti, numCoeff, alfFiltParam->filters_per_group); } /** * --------------------------------------------------------------------------- * Function: ALF parameter selection * Input: * alfPictureParam: The ALF parameter * apsId: The ALF parameter index in the buffer * isNewApsSentThe New flag index * lambda : The lambda value in the ALF-RD decision * Return: * --------------------------------------------------------------------------- */ static void setCurAlfParam(xavs2_t *h, alf_ctx_t *Enc_ALF, aec_t *p_aec, ALFParam *alfPictureParam, double lambda) { int compIdx, i; AlfCorrData *alfPicCorr[IMG_CMPNTS]; double costMin, cost; ALFParam tempAlfParam[IMG_CMPNTS]; int picHeaderBitrate = 0; costMin = MAX_DOUBLE; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { init_alf_frame_param(&tempAlfParam[compIdx]); alfPicCorr[compIdx] = &Enc_ALF->m_pic_corr[compIdx]; } for (i = 0; i < Enc_ALF->m_alfReDesignIteration; i++) { // redesign filter according to the last on off results, "!i" replace TRUE/FALSE to control design or redesign accumulateLCUCorrelations(h, alfPicCorr, Enc_ALF->m_alfCorr, !i); deriveFilterInfo(Enc_ALF, tempAlfParam, alfPicCorr, NO_VAR_BINS, lambda); // estimate cost cost = executePicLCUOnOffDecisionRDOEstimate(h, Enc_ALF, p_aec, tempAlfParam, lambda, Enc_ALF->m_alfCorr[0]); picHeaderBitrate = estimateALFBitrateInPicHeader(tempAlfParam); cost += (double)picHeaderBitrate * lambda; if (cost < costMin) { costMin = cost; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { copyALFparam(&alfPictureParam[compIdx], &tempAlfParam[compIdx], compIdx); } } } alfPicCorr[0] = alfPicCorr[1] = alfPicCorr[2] = NULL; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ int alf_get_buffer_size(const xavs2_param_t *param) { int size_lcu = 1 << param->lcu_bit_level; int width_in_lcu = (param->org_width + size_lcu - 1) >> param->lcu_bit_level; int height_in_lcu = (param->org_height + size_lcu - 1) >> param->lcu_bit_level; int num_lcu = height_in_lcu * width_in_lcu; int maxNumTemporalLayer = (int)(log10((float)(param->i_gop_size)) / log10(2.0) + 1); int mem_size = sizeof(alf_ctx_t) + 2 * IMG_CMPNTS * num_lcu * sizeof(AlfCorrData) + maxNumTemporalLayer * IMG_CMPNTS * num_lcu * sizeof(AlfCorrData) + num_lcu * sizeof(int) // m_numSlicesDataInOneLCU + num_lcu * sizeof(int8_t) // 
tab_lcu_region + num_lcu * IMG_CMPNTS * sizeof(bool_t) // is_alf_lcu_on[3] + num_lcu * sizeof(AlfCorrData) //for other function temp variable alfPicCorr + CACHE_LINE_SIZE * 50; return mem_size; } /* --------------------------------------------------------------------------- */ void alf_init_buffer(xavs2_t *h, uint8_t *mem_base) { /* ALF region index table in scan order: maps each of the 16 picture quadrants (4x4 grid) to an ALF region index */ static const uint8_t regionTable[NO_VAR_BINS] = { 0, 1, 4, 5, 15, 2, 3, 6, 14, 11, 10, 7, 13, 12, 9, 8} ; int width_in_lcu = h->i_width_in_lcu; int height_in_lcu = h->i_height_in_lcu; int quad_w_in_lcu = ((width_in_lcu + 1) >> 2); int quad_h_in_lcu = ((height_in_lcu + 1) >> 2); int region_idx_x; int region_idx_y; int i, j; int num_lcu = height_in_lcu * width_in_lcu; int compIdx, n; int maxNumTemporalLayer = (int)(log10((float)(h->param->i_gop_size)) / log10(2.0) + 1); int mem_size; uint8_t *mem_ptr = mem_base; alf_ctx_t *Enc_ALF; mem_size = alf_get_buffer_size(h->param); memset(mem_ptr, 0, mem_size); Enc_ALF = (alf_ctx_t *)mem_ptr; mem_ptr += sizeof(alf_ctx_t); Enc_ALF->m_alfReDesignIteration = 3; Enc_ALF->m_uiBitIncrement = 0; for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { Enc_ALF->m_alfCorr[compIdx] = (AlfCorrData *)mem_ptr; mem_ptr += (num_lcu * sizeof(AlfCorrData)); } for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { Enc_ALF->m_alfNonSkippedCorr[compIdx] = (AlfCorrData *)mem_ptr; mem_ptr += (num_lcu * sizeof(AlfCorrData)); } Enc_ALF->m_alfPrevCorr = (AlfCorrData *)mem_ptr; mem_ptr += maxNumTemporalLayer * IMG_CMPNTS * num_lcu * sizeof(AlfCorrData); for (n = 0; n < maxNumTemporalLayer; n++) { for (compIdx = 0; compIdx < IMG_CMPNTS; compIdx++) { init_alf_frame_param(&(Enc_ALF->m_alfPictureParam[n][compIdx])); } } for (n = 0; n < NO_VAR_BINS; n++) { Enc_ALF->m_coeffNoFilter[n][ALF_MAX_NUM_COEF - 1] = (1 << ALF_NUM_BIT_SHIFT); } Enc_ALF->m_numSlicesDataInOneLCU = (int *)mem_ptr; mem_ptr += (num_lcu * sizeof(int)); Enc_ALF->tab_lcu_region = (int8_t *)mem_ptr; mem_ptr += (num_lcu * sizeof(int8_t)); h->is_alf_lcu_on = (bool_t(*)[IMG_CMPNTS])mem_ptr; mem_ptr += (num_lcu * IMG_CMPNTS * sizeof(bool_t)); for (j = 0; j < height_in_lcu; j++) { region_idx_y = (quad_h_in_lcu == 0) ? 3 : XAVS2_MIN(j / quad_h_in_lcu, 3); for (i = 0; i < width_in_lcu; i++) { region_idx_x = (quad_w_in_lcu == 0) ? 
3 : XAVS2_MIN(i / quad_w_in_lcu, 3); Enc_ALF->tab_lcu_region[j * width_in_lcu + i] = regionTable[region_idx_y * 4 + region_idx_x]; } } h->enc_alf = Enc_ALF; aec_init_coding_state(&h->cs_data.cs_alf_cu_ctr); aec_init_coding_state(&h->cs_data.cs_alf_initial); } /* --------------------------------------------------------------------------- */ void alf_filter_one_frame(xavs2_t *h) { aec_t *p_aec = &h->aec; ALFParam *alfPictureParam = h->pic_alf_params; xavs2_frame_t *p_org = h->fenc; xavs2_frame_t *p_rec = h->img_alf; alf_ctx_t *Enc_ALF = (alf_ctx_t *)h->enc_alf; double lambda_mode = h->f_lambda_mode * LAMBDA_SCALE_LUMA; int i; h->copy_aec_state_rdo(&h->cs_data.cs_alf_initial, p_aec); // init ALF buffers for (i = 0; i < IMG_CMPNTS; i++) { init_alf_frame_param(&alfPictureParam[i]); } setCurAlfParam(h, Enc_ALF, p_aec, alfPictureParam, lambda_mode); executePicLCUOnOffDecision(h, Enc_ALF, p_aec, alfPictureParam, lambda_mode, p_org, p_rec, h->fdec); // set ALF frame parameters for (i = 0; i < IMG_CMPNTS; i++) { h->pic_alf_on[i] = alfPictureParam[i].alf_flag; } } xavs2-1.3/source/encoder/alf.h000066400000000000000000000037701340660520300162600ustar00rootroot00000000000000/* * alf.h * * Description of this file: * ALF functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_ALF_H #define XAVS2_ALF_H #define alf_get_buffer_size FPFX(alf_get_buffer_size) int alf_get_buffer_size(const xavs2_param_t *param); #define alf_init_buffer FPFX(alf_init_buffer) void alf_init_buffer(xavs2_t *h, uint8_t *mem_base); #define alf_filter_one_frame FPFX(alf_filter_one_frame) void alf_filter_one_frame(xavs2_t *h); #define alf_get_statistics_lcu FPFX(alf_get_statistics_lcu) void alf_get_statistics_lcu(xavs2_t *h, int lcu_x, int lcu_y, xavs2_frame_t *p_org, xavs2_frame_t *p_rec); #endif // XAVS2_ALF_H xavs2-1.3/source/encoder/bitstream.h000066400000000000000000000243371340660520300175120ustar00rootroot00000000000000/* * bitstream.h * * Description of this file: * Bitstream Processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_BITSTREAM_H #define XAVS2_BITSTREAM_H /** * =========================================================================== * global variables * =========================================================================== */ #if XAVS2_TRACE extern int g_sym_count; /* global symbol count for trace */ extern int g_bit_count; /* global bit count for trace */ #endif /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void xavs2_bs_init(bs_t *bs, void *p_data, int i_data) { bs->p_start = (uint8_t *)p_data; bs->p = (uint8_t *)p_data; bs->p_end = bs->p_start + i_data; bs->i_left = 8; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int xavs2_bs_pos(bs_t *bs) { return ((int)(8 * (bs->p - bs->p_start) + 8 - bs->i_left)); } /* --------------------------------------------------------------------------- * writes UVLC code to the bitstream buffer */ static ALWAYS_INLINE void xavs2_bs_write(bs_t *bs, uint32_t code, int len) { assert(bs->p < bs->p_end); while (len > 0) { if (len < 32) { code &= (1 << len) - 1; } if (len < bs->i_left) { (*bs->p) = (uint8_t)(((*bs->p) << len) | code); bs->i_left -= len; break; } else { (*bs->p) = (uint8_t)(((*bs->p) << bs->i_left) | (code >> (len - bs->i_left))); bs->p++; len -= bs->i_left; bs->i_left = 8; } } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void xavs2_bs_write1(bs_t *bs, uint8_t i_bit) { if (bs->p < bs->p_end) { (*bs->p) <<= 1; (*bs->p) |= i_bit; if (--bs->i_left == 0) { bs->i_left = 8; bs->p++; } } } /* --------------------------------------------------------------------------- * one bit "1" is added to the end of stream, then some bits "0" are added * to byte aligned position. */ static ALWAYS_INLINE void xavs2_bs_stuff_bits(bs_t *bs) { if (bs->i_left != 8) { xavs2_bs_write1(bs, 1); xavs2_bs_write(bs, 0, bs->i_left & 7); } else { xavs2_bs_write(bs, 0x80, 8); } } #define bs_stuff_bits xavs2_bs_stuff_bits /* --------------------------------------------------------------------------- * one bit "1" is added to the end of stream, then some bits "0" are added * to byte aligned position. 
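 * (for example, with 3 bits still unfilled in the current byte, a single '1' bit followed
 * by two '0' bits is written and the function returns 3, the number of stuffed bits)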
*/ static ALWAYS_INLINE int xavs2_bs_byte_align(bs_t *bs) { if (bs->i_left != 8) { int bits = bs->i_left; xavs2_bs_write1(bs, 1); xavs2_bs_write(bs, 0, bs->i_left & 7); return bits; } return 0; } #define bs_byte_align xavs2_bs_byte_align /* --------------------------------------------------------------------------- * write out a trace string to the trace file */ #if XAVS2_TRACE static void write_trace_info(char *trace_string, int bit_pattern, int value, int len) { int i, chars; xavs2_trace("@"); chars = xavs2_trace("%i", g_bit_count); while (chars++ < 6) { xavs2_trace(" "); } chars += xavs2_trace("%s", trace_string); while (chars++ < 55) { xavs2_trace(" "); } // align bit-pattern if (len < 15) { for (i = 0; i < 15 - len; i++) { xavs2_trace(" "); } } // print bit-pattern g_bit_count += len; for (i = 1; i <= len; i++) { if ((bit_pattern >> (len - i)) & 0x1) { xavs2_trace("1"); } else { xavs2_trace("0"); } } xavs2_trace(" (%3d) \n", value); } #endif /* --------------------------------------------------------------------------- * write out a trace string to the trace file */ #if XAVS2_TRACE static void write_trace_info2(char *trace_string, int value, int len) { int i, chars; xavs2_trace("@"); chars = xavs2_trace("%i", g_bit_count); while (chars++ < 6) { xavs2_trace(" "); } chars += xavs2_trace("%s", trace_string); while (chars++ < 55) { xavs2_trace(" "); } // align bit-pattern if (len < 15) { for (i = 0; i < 15 - len; i++) { xavs2_trace(" "); } } g_bit_count += len; while (len >= 32) { for (i = 0; i < 8; i++) { xavs2_trace("0"); } len -= 8; } // print bit-pattern for (i = 0; i < len; i++) { if (0x01 & (value >> (len - i - 1))) { xavs2_trace("1"); } else { xavs2_trace("0"); } } xavs2_trace(" (%3d) \n", value); } #endif /** * --------------------------------------------------------------------------- * Function : writes an ue(v) syntax element, returns the length in bits * Parameters : * [in ] : trace_string - the string for the trace file * : value - the value to be coded * [out] : bs - the bs_t the value should be coded into * Return : number of bits used by the coded syntax element * --------------------------------------------------------------------------- */ #if XAVS2_TRACE static ALWAYS_INLINE int xavs2_bs_write_ue(bs_t *bs, char *trace_string, int value) #else static ALWAYS_INLINE int xavs2_bs_write_ue(bs_t *bs, int value) #endif { int i, nn, len, inf, suffix_len, bit_pattern; // generates UVLC code and passes the codeword to the buffer nn = (value + 1) >> 1; for (i = 0; i < 16 && nn != 0; i++) { nn /= 2; } len = 2 * i + 1; inf = value + 1 - (int)pow(2, i); suffix_len = len >> 1; bit_pattern = (1 << suffix_len) | (inf & ((1 << suffix_len) - 1)); xavs2_bs_write(bs, bit_pattern, len); #if XAVS2_TRACE write_trace_info(trace_string, bit_pattern, value, len); #endif return len; } /** * --------------------------------------------------------------------------- * Function : writes an se(v) syntax element, returns the length in bits * Parameters : * [in ] : trace_string - the string for the trace file * : value - the value to be coded * [out] : bs - the bs_t the value should be coded into * Return : number of bits used by the coded syntax element * --------------------------------------------------------------------------- */ #if XAVS2_TRACE static ALWAYS_INLINE int xavs2_bs_write_se(bs_t *bs, char *trace_string, int value) #else static ALWAYS_INLINE int xavs2_bs_write_se(bs_t *bs, int value) #endif { #if XAVS2_TRACE return xavs2_bs_write_ue(bs, trace_string, value <= 0 ? 
-value * 2 : value * 2 - 1); #else return xavs2_bs_write_ue(bs, value <= 0 ? -value * 2 : value * 2 - 1); #endif } /** * --------------------------------------------------------------------------- * Function : writes n bit fixed length syntax element, returns the length in bits * Parameters : * [in ] : len - number of bits write out * : trace_string - the string for the trace file * : value - the value to be coded * [out] : bs - the bs_t the value should be coded into * Return : number of bits used by the coded syntax element * --------------------------------------------------------------------------- */ #if XAVS2_TRACE static ALWAYS_INLINE int xavs2_bs_write_uv(bs_t *bs, int len, char *trace_string, int value, int b_trace) #else static ALWAYS_INLINE int xavs2_bs_write_uv(bs_t *bs, int len, int value) #endif { xavs2_bs_write(bs, value, len); #if XAVS2_TRACE if (b_trace) { write_trace_info(trace_string, value, value, len); } #endif return len; } #if XAVS2_TRACE #define ue_v(bs, value, trace_string) xavs2_bs_write_ue(bs, trace_string, value) #define se_v(bs, value, trace_string) xavs2_bs_write_se(bs, trace_string, value) #define u_v(bs, len, value, trace_string) xavs2_bs_write_uv(bs, len, trace_string, value, 1) #define u_0(bs, len, value, trace_string) xavs2_bs_write_uv(bs, len, trace_string, value, 0) /* no trace */ #else #define ue_v(bs, value, trace_string) xavs2_bs_write_ue(bs, value) #define se_v(bs, value, trace_string) xavs2_bs_write_se(bs, value) #define u_v(bs, len, value, trace_string) xavs2_bs_write_uv(bs, len, value) #define u_0(bs, len, value, trace_string) xavs2_bs_write_uv(bs, len, value) #endif #endif // XAVS2_BITSTREAM_H xavs2-1.3/source/encoder/encoder.c000066400000000000000000002372711340660520300171350ustar00rootroot00000000000000/* * encoder.c * * Description of this file: * Encoder functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "mc.h" #include "wrapper.h" #include "encoder.h" #include "frame.h" #include "aec.h" #include "slice.h" #include "nal.h" #include "ratecontrol.h" #include "tdrdo.h" #include "me.h" #include "cpu.h" #include "rdo.h" #include "rps.h" #include "wquant.h" #include "header.h" #include "cudata.h" #include "block_info.h" #include "presets.h" #include "version.h" #include "alf.h" #include "sao.h" #include "ratecontrol.h" /* "video_sequence_end_code", 0xB1 */ static const uint8_t end_code[16] = { 0x00, 0x00, 0x01, 0xB1 }; static const int len_end_code = 4; /** * =========================================================================== * local tables * =========================================================================== */ static const int tab_frm_type_to_slice_type[] = { 0, 0, 0, 1, 3, 2, 0, 0, 0 }; // --------------------------------------------------------------------------- static const int tab_LambdaQ[4] = { // [slice_type] -2, 0, 0, 0 }; // --------------------------------------------------------------------------- static const double tab_LambdaF[4] = { // [slice_type] 0.60, 0.60, 0.85, 0.60 }; // --------------------------------------------------------------------------- static const int8_t tab_cu_bfs_order[] = { 21, 5, 1, 0 }; extern double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH]; /* --------------------------------------------------------------------------- * QSFD threshold */ static ALWAYS_INLINE void qsfd_calculate_threshold_of_a_frame(xavs2_t *h) { assert(sizeof(h->thres_qsfd_cu) == sizeof(tab_qsfd_thres[0])); memcpy(h->thres_qsfd_cu, tab_qsfd_thres[h->i_qp], sizeof(h->thres_qsfd_cu)); } /* --------------------------------------------------------------------------- * decrease the reference count by one */ static void release_one_frame(xavs2_t *h, xavs2_frame_t *frame) { xavs2_handler_t *h_mgr = h->h_top; xavs2_thread_mutex_lock(&frame->mutex); /* lock */ assert(frame->cnt_refered > 0); frame->cnt_refered--; xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ if (frame->cnt_refered == 0) { /* signal to the h_mgr */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_FRM_BUFFER_RELEASED]); } } /* --------------------------------------------------------------------------- * fill packet data for output */ static ALWAYS_INLINE void encoder_fill_packet_data(xavs2_handler_t *h_mgr, xavs2_outpacket_t *packet, xavs2_frame_t *frame) { assert(packet != NULL); packet->private_data = frame; packet->opaque = h_mgr->user_data; if (frame == NULL) { packet->stream = end_code; packet->len = 0; packet->state = XAVS2_STATE_FLUSH_END; packet->type = 0; packet->pts = h_mgr->max_out_pts; packet->dts = h_mgr->max_out_dts; if (h_mgr->b_seq_end == 0) { packet->len = len_end_code; h_mgr->b_seq_end = 1; } } else { assert(frame->i_bs_len > 0); packet->stream = frame->p_bs_buf; packet->len = frame->i_bs_len; packet->state = XAVS2_STATE_ENCODED; packet->type = frame->i_frm_type; packet->pts = frame->i_pts; packet->dts = frame->i_dts; h_mgr->max_out_pts = XAVS2_MAX(h_mgr->max_out_pts, frame->i_pts); h_mgr->max_out_dts = XAVS2_MAX(h_mgr->max_out_dts, frame->i_dts); } } /** * --------------------------------------------------------------------------- * Function : output encoded data from the encoder * Parameters : * [in] : opaque - user data * [in] : frame - pointer to the frame * Return : none * --------------------------------------------------------------------------- */ static INLINE void encoder_output_frame_bitstream(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { if (frame != NULL) { 
xl_append(&h_mgr->list_frames_output, frame); } } /** * --------------------------------------------------------------------------- * Function : fetch bit-stream of one encoded frame * Parameters : * [in ] : h_mgr - pointer to xavs2_handler_t * [out] : packet of one encoded frame * Return : none * --------------------------------------------------------------------------- */ void encoder_fetch_one_encoded_frame(xavs2_handler_t *h_mgr, xavs2_outpacket_t *packet, int is_flush) { int num_encoding_frames = h_mgr->num_encode - h_mgr->num_output; // ڱ֡ int num_frames_threads = h_mgr->i_frm_threads; // ֡ /* clear packet data */ packet->len = 0; packet->private_data = NULL; if (is_flush && h_mgr->num_input == h_mgr->num_output) { /* all frames are encoded and have been output; * return video_end_code to finish encoding */ encoder_fill_packet_data(h_mgr, packet, NULL); } else if (num_encoding_frames > num_frames_threads || is_flush) { /* now we should wait for one frame output */ xavs2_frame_t *frame = (xavs2_frame_t *)xl_remove_head(&h_mgr->list_frames_output, 1); if (frame != NULL) { encoder_fill_packet_data(h_mgr, packet, frame); h_mgr->num_output++; assert(frame->i_bs_len > 0); } } } /* --------------------------------------------------------------------------- * check pseudo code and merge slice data with slice header bits */ static ALWAYS_INLINE void check_pseudo_code_and_merge_slice_data(bs_t *p_bs, aec_t *p_aec) { uint8_t *dst = p_bs->p; /* point to the end of previous bitstream */ uint8_t *src = p_aec->p_start; /* point to the start position of source */ uint8_t *end = p_aec->p; /* point to the end position of source */ assert(p_bs->i_left == 8); /* check pseudo code */ p_bs->p = nal_escape_c(dst, src, end); } /* --------------------------------------------------------------------------- * calculate lambda for one frame */ void xavs2e_get_frame_lambda(xavs2_t *h, xavs2_frame_t *cur_frm, int i_qp) { double lambda; int i_type = tab_frm_type_to_slice_type[cur_frm->i_frm_type]; int qp = i_qp - SHIFT_QP; int rps_idx = cur_frm->rps_index_in_gop; #if ENABLE_WQUANT // adaptive frequency weighting quantization if (h->WeightQuantEnable) { qp += tab_LambdaQ[i_type]; } #endif lambda = pow(2, qp / 4.0); #if ENABLE_WQUANT if (h->WeightQuantEnable) { lambda *= tab_LambdaF[i_type]; } else { lambda *= 0.85 * LAM_2Level_TU; } #else lambda *= 0.85 * LAM_2Level_TU; #endif if (h->param->intra_period_max != 1) { if (h->param->num_bframes > 0) { if (i_type != SLICE_TYPE_I && rps_idx != 0) { if (i_type == SLICE_TYPE_B) { lambda *= 1.2; } lambda *= XAVS2_CLIP3F(2.00, 4.00, qp / 8.0); } else if (i_type == SLICE_TYPE_I) { lambda *= 0.8; } } else if (i_type != SLICE_TYPE_I) { lambda *= 0.8; if ((rps_idx + 1) % h->i_gop_size != 0) { lambda *= XAVS2_CLIP3F(2.00, 4.00, qp / 8.0) * 0.8; } } } /* only use for RA configure */ #if AQPO if (h->param->is_enable_AQPO && h->param->intra_period_max != 1 && h->param->i_cfg_type == 2) { int gop_size; int num_poc; int index; int intra_period_num; int temp_a, temp_b, temp_c, temp_d; float temp_e, temp_f, temp_g; gop_size = h->param->i_gop_size; num_poc = (h->curr_coi >> 8) << 8; if (cur_frm->i_frame + num_poc == 0) { h->param->GopQpOffset_old = 0; } if (cur_frm->i_frame + num_poc != 0) { if ((cur_frm->i_frame + num_poc) % gop_size == 0) { intra_period_num = h->param->intra_period_max; index = ((cur_frm->i_frame + num_poc + gop_size - 1) / gop_size) % intra_period_num; temp_a = (intra_period_num - index) >> 1; temp_b = intra_period_num >> 1; temp_c = (intra_period_num - 1) >> 1; 
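                /* note on the block below: temp_d .. temp_g map the position
                 * ("index") of the current GOP inside the intra period onto a
                 * GOP-level QP offset of 0, 1 or 2, which is then folded back
                 * into lambda as exp(GopQpOffset / 5.661) -- the same 5.661
                 * QP/lambda constant used for the QP refinement in
                 * encoder_alloc_frame_task() */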
temp_d = (intra_period_num - index - 1) >> 1; temp_e = 7.5 * (1 / (float)pow(2, temp_b)) + 5 * (1 / (float)pow(2, temp_c)); temp_f = 7.5 * (1 / (float)pow(2, temp_a)) + 5 * (1 / (float)pow(2, temp_d)); temp_g = (temp_f - temp_e) / (14 - temp_e); temp_g = (temp_g + 1.0)*intra_period_num / (intra_period_num + 2.0); if (temp_g < 0.8001) { h->param->GopQpOffset = 0; } else if (temp_g < 1.0) { h->param->GopQpOffset = 1; } else { h->param->GopQpOffset = 2; } h->param->GopQpOffset = XAVS2_CLIP3(0, h->param->GopQpOffset_old + 1, h->param->GopQpOffset); h->param->GopQpOffset_old = h->param->GopQpOffset; } lambda *= exp(h->param->GopQpOffset / 5.661); } } #endif cur_frm->f_frm_lambda_ssd = lambda; cur_frm->i_frm_lambda_sad = LAMBDA_FACTOR(sqrt(lambda)); } /* --------------------------------------------------------------------------- * calculate lambda for RDO */ static void xavs2e_update_lambda(xavs2_t *h, int i_type, double lambda) { /* get lambda for RDO */ h->f_lambda_mode = (rdcost_t)lambda; h->i_lambda_factor = LAMBDA_FACTOR(sqrt(lambda)); h->f_lambda_1th = 1.0 / h->f_lambda_mode; /* get lambda for RDOQ */ if (h->param->i_rdoq_level != RDOQ_OFF) { h->f_lambda_rdoq = (lambda * h->param->lambda_factor_rdoq + 50) / 100; if (i_type == SLICE_TYPE_P || i_type == SLICE_TYPE_F) { h->f_lambda_rdoq = (h->f_lambda_rdoq * h->param->lambda_factor_rdoq_p + 50) / 100; } else if (i_type == SLICE_TYPE_B) { h->f_lambda_rdoq = (h->f_lambda_rdoq * h->param->lambda_factor_rdoq_b + 50) / 100; } } } /* --------------------------------------------------------------------------- * initializes the parameters for a new frame */ static void init_frame(xavs2_t *h, xavs2_frame_t *frame) { int frame_size_in_4x4 = h->i_height_in_minpu * h->i_width_in_minpu; dist_t *all_mincost = &h->all_mincost[0][0][0]; h->fenc = frame; h->i_nal = 0; switch (frame->i_frm_type) { case XAVS2_TYPE_I: h->i_nal_type = NAL_SLICE; h->i_nal_ref_idc = NAL_PRIORITY_HIGHEST; h->i_type = SLICE_TYPE_I; break; case XAVS2_TYPE_P: h->i_nal_type = NAL_SLICE; h->i_nal_ref_idc = NAL_PRIORITY_HIGH; h->i_type = SLICE_TYPE_P; break; case XAVS2_TYPE_F: h->i_nal_type = NAL_SLICE; h->i_nal_ref_idc = NAL_PRIORITY_HIGH; h->i_type = SLICE_TYPE_F; break; default: h->i_nal_type = NAL_SLICE; h->i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE; h->i_type = SLICE_TYPE_B; break; } // initialize slice index of each CTU/LCU g_funcs.fast_memset(h->lcu_slice_idx, -1, h->i_width_in_lcu * h->i_height_in_lcu * sizeof(int8_t)); // initialize MVs, references and prediction direction if (h->param->me_method == XAVS2_ME_UMH) { g_funcs.mem_repeat_i(all_mincost, 1 << 30, frame_size_in_4x4 * MAX_INTER_MODES * MAX_REFS * sizeof(dist_t) / sizeof(int32_t)); } } /* --------------------------------------------------------------------------- * get the next input frame order */ static int Advance2NextFrame(xavs2_handler_t *h_mgr, int frame) { return (frame + 1) % h_mgr->i_frm_threads; } /* --------------------------------------------------------------------------- * get a frame encoder handle */ static xavs2_t *encoder_alloc_frame_task(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { int refs_unavailable = 0; int i, j; xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ /* wait until we successfully get one frame context */ for (;;) { for (i = 0; i < h_mgr->i_frm_threads; i++) { /* alloc a frame task */ xavs2_t *h = h_mgr->frm_contexts[i]; assert(h->task_type == XAVS2_TASK_FRAME); if (h->task_status == XAVS2_TASK_FREE) { /* initialize the task */ h->task_status = XAVS2_TASK_BUSY; h->i_frame_b = 
h_mgr->dpb.i_frame_b; h->ip_pic_idx = h_mgr->dpb.ip_pic_idx; h->i_aec_frm = h_mgr->i_frame_in; h->b_all_row_ctx_released = 0; #if XAVS2_STAT /* reset frame statistics */ memset(&h->frameinfo->frame_stat, 0, sizeof(frame_stat_t)); #endif /* reset all rows */ for (j = 0; j < h->i_height_in_lcu; j++) { row_info_t *row = &h->frameinfo->rows[j]; row->h = 0; row->row = j; row->coded = -1; } /* init caches */ init_frame(h, frame); h->fenc->b_random_access_decodable = (h->fenc->i_frame >= h_mgr->dpb.POC_IDR); /* update the task manager */ frame_buffer_update(h, &h_mgr->dpb, h->fenc); /* advance to the next input frame */ h_mgr->i_frame_in = Advance2NextFrame(h_mgr, h_mgr->i_frame_in); /* prepare the reference list */ h->fdec = find_fdec_and_build_rps(h, &h_mgr->dpb, h->fenc, h->fref); if (h->fdec == NULL) { xavs2_log(NULL, XAVS2_LOG_DEBUG, "find FDEC or build reference lists fail\n"); refs_unavailable = 1; break; } /* decide frame QP and lambdas */ h->fenc->i_frm_qp = clip_qp(h, xavs2_rc_get_base_qp(h) + h->fenc->rps.qp_offset); xavs2e_get_frame_lambda(h, h->fenc, h->fenc->i_frm_qp); h->i_qp = h->fenc->i_frm_qp; /* update lambda in encoder handler (h) */ xavs2e_update_lambda(h, h->i_type, h->fenc->f_frm_lambda_ssd); h->frameinfo->frame_stat.stat_frm.f_lambda_frm = h->f_lambda_mode; /* refine qp */ if (h->param->enable_refine_qp && h->param->intra_period_min > 1) { h->i_qp = (int)(5.661 * log((double)(h->f_lambda_mode)) + 13.131 + 0.5); } /* udpdate some properties */ h->i_ref = h->fenc->rps.num_of_ref; h->i_layer = h->fenc->rps.temporal_id; assert(h->i_ref <= XAVS2_MAX_REFS); xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ /* signal to the aec thread */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_FRM_CONTEXT_ALLOCATED]); return h; } } if (refs_unavailable) { break; } xavs2_thread_cond_wait(&h_mgr->cond[SIG_FRM_CONTEXT_RELEASED], &h_mgr->mutex); } xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ return 0; } /* --------------------------------------------------------------------------- * set frame task status */ static void encoder_set_task_status(xavs2_t *h, task_status_e status) { xavs2_handler_t *h_mgr = h->h_top; assert(h->task_type == XAVS2_TASK_FRAME); xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ if ((status == XAVS2_TASK_RDO_DONE && h->task_status == XAVS2_TASK_AEC_DONE) || (status == XAVS2_TASK_AEC_DONE && h->task_status == XAVS2_TASK_RDO_DONE)) { h->task_status = XAVS2_TASK_FREE; } else { h->task_status = status; } xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ if (status == XAVS2_TASK_AEC_DONE) { /* signal to the output proc */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_FRM_AEC_COMPLETED]); } if (h->task_status == XAVS2_TASK_FREE) { /* broadcast to the task manager & flush */ xavs2_thread_cond_broadcast(&h_mgr->cond[SIG_FRM_CONTEXT_RELEASED]); } } /* --------------------------------------------------------------------------- */ static void encoder_write_rec_frame(xavs2_handler_t *h_mgr) { xavs2_t *h = h_mgr->p_coder; xavs2_frame_t **DPB = h_mgr->dpb.frames; int size_dpb = h_mgr->dpb.num_frames; int i = 0; int j; xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ while (i < size_dpb) { int next_output_frame_idx; xavs2_frame_t *frame = DPB[i]; if (frame == NULL) { i++; continue; } xavs2_thread_mutex_lock(&frame->mutex); /* lock */ next_output_frame_idx = get_next_frame_id(h_mgr->i_output); if (frame->i_frame == next_output_frame_idx) { /* has the frame already been reconstructed ? 
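             * (it counts as reconstructed only when every LCU row reports
             * num_lcu_coded_in_row[j] >= i_width_in_lcu; the row scan below
             * breaks out at the first unfinished row and postpones writing
             * this frame)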
*/ for (j = 0; j < h->i_height_in_lcu; j++) { if (frame->num_lcu_coded_in_row[j] < h->i_width_in_lcu) { break; } } if (j < h->i_height_in_lcu) { /* frame doesn't finish reconstruction */ xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ break; } /* update output frame index */ h_mgr->i_output = next_output_frame_idx; #if XAVS2_DUMP_REC dump_yuv_out(h, h_mgr->h_rec_file, frame, h->param->org_width, h->param->org_height); #endif //if XAVS2_DUMP_REC xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ /* release one frame */ release_one_frame(h, frame); // write reconstruction file /* start over for the next reconstruction frame */ i = 0; continue; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ i++; } xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ } /* --------------------------------------------------------------------------- * the aec encoding */ static INLINE void encoder_encode_frame_header(xavs2_t *h) { bs_t *p_bs = &h->header_bs; int overhead = 30; /* number of overhead bytes (include I/P/B picture header) */ /* init bitstream context */ xavs2_bs_init(p_bs, h->p_bs_buf_header, h->i_bs_buf_header); /* create sequence header if need ------------------------------ */ if (h->fenc->b_keyframe) { if (h->fenc->i_frm_coi == 0 || h->param->intra_period_min > 1) { /* generate sequence parameters */ nal_start(h, NAL_SPS, NAL_PRIORITY_HIGHEST); xavs2_sequence_write(h, p_bs); nal_end(h); overhead += h->p_nal[h->i_nal - 1].i_payload; /* generate user data */ nal_start(h, NAL_AUD, NAL_PRIORITY_HIGHEST); xavs2_user_data_write(p_bs); nal_end(h); overhead += h->p_nal[h->i_nal - 1].i_payload; } } /* nal start for picture header */ nal_start(h, NAL_PPS, NAL_PRIORITY_HIGHEST); if (h->i_type == SLICE_TYPE_I) { xavs2_intra_picture_header_write(h, p_bs); } else { xavs2_inter_picture_header_write(h, p_bs); } // write picture header (ALF) xavs2_picture_header_alf_write(h, h->pic_alf_params, p_bs); bs_stuff_bits(p_bs); // stuff bits after finishing ALF nal_end(h); // nal for picture header } /* --------------------------------------------------------------------------- * the aec encoding */ static void *encoder_aec_encode_one_frame(xavs2_t *h) { aec_t aec; frame_info_t *frame = h->frameinfo; xavs2_frame_t *fdec = h->fdec; row_info_t *row = NULL; lcu_info_t *lcu = NULL; slice_t *slice = NULL; aec_t *p_aec = &aec; outputframe_t output_frame; #if XAVS2_STAT frame_stat_t *frm_stat = &frame->frame_stat; int i = 0; #endif int lcu_xy = 0; int lcu_x = 0, lcu_y = 0; /* encode frame header */ encoder_encode_frame_header(h); /* encode all LCUs */ for (lcu_y = 0; lcu_y < h->i_height_in_lcu; lcu_y++) { row = &frame->rows[lcu_y]; /* wait until the row finishes RDO */ xavs2_thread_mutex_lock(&fdec->mutex); /* lock */ while (fdec->num_lcu_coded_in_row[lcu_y] < h->i_width_in_lcu) { xavs2_thread_cond_wait(&fdec->cond, &fdec->mutex); } xavs2_thread_mutex_unlock(&fdec->mutex); /* unlock */ /* row is clear: start aec for every LCU */ for (lcu_x = 0; lcu_x < h->i_width_in_lcu; lcu_x++, lcu_xy++) { lcu = &row->lcus[lcu_x]; slice = h->slices[lcu->slice_index]; // while (fdec->num_lcu_coded_in_row[lcu_y] <= lcu_x) { // xavs2_sleep_ms(1); // } if (lcu_xy == slice->i_first_lcu_xy) { /* slice start : initialize the aec engine */ aec_start(h, p_aec, slice->bs.p_start + PSEUDO_CODE_SIZE, slice->bs.p_end, 1); p_aec->b_writting = 1; } if (h->param->enable_sao) { write_saoparam_one_lcu(h, p_aec, lcu_x, lcu_y, h->slice_sao_on, h->sao_blk_params[lcu_y * h->i_width_in_lcu + lcu_x]); } if (h->param->enable_alf) { int 
alf_comp; for (alf_comp = 0; alf_comp < 3; alf_comp++) { if (h->pic_alf_on[alf_comp]) { p_aec->binary.write_alf_lcu_ctrl(p_aec, h->is_alf_lcu_on[lcu_xy][alf_comp]); } } } xavs2_lcu_write(h, p_aec, lcu, h->i_lcu_level, lcu->pix_x, lcu->pix_y); /* for the last LCU in SLice, write 1, otherwise write 0 */ xavs2_lcu_terminat_bit_write(p_aec, lcu_xy == slice->i_last_lcu_xy); } /* LCUмSliceַʽ */ if (lcu_xy >= slice->i_last_lcu_xy) { int bs_len; /* slice done */ aec_done(p_aec); /* check pseudo start code, and store bit stream length */ check_pseudo_code_and_merge_slice_data(&slice->bs, p_aec); bs_len = xavs2_bs_pos(&slice->bs) / 8; nal_merge_slice(h, slice->p_slice_bs_buf, bs_len, h->i_nal_type, h->i_nal_ref_idc); } } h->fenc->i_bs_len = (int)encoder_encapsulate_nals(h, h->fenc, 0); #if XAVS2_STAT /* collect frame properties */ frm_stat->i_type = h->i_type; frm_stat->i_frame = h->fenc->i_frame; frm_stat->i_qp = h->i_qp; frm_stat->i_ref = h->i_ref; for (i = 0; i < h->i_ref; i++) { frm_stat->ref_poc_set[i] = h->fref[i]->i_frm_poc >> 1; } h->fenc->i_time_end = xavs2_mdate(); if (h->param->enable_psnr) { encoder_cal_psnr(h, &frm_stat->stat_frm.f_psnr[0], &frm_stat->stat_frm.f_psnr[1], &frm_stat->stat_frm.f_psnr[2]); } else { frm_stat->stat_frm.f_psnr[0] = 0; frm_stat->stat_frm.f_psnr[1] = 0; frm_stat->stat_frm.f_psnr[2] = 0; } if (h->param->enable_ssim) { encoder_cal_ssim(h, &frm_stat->stat_frm.f_ssim[0], &frm_stat->stat_frm.f_ssim[1], &frm_stat->stat_frm.f_ssim[2]); } else { frm_stat->stat_frm.f_ssim[0] = 0; frm_stat->stat_frm.f_ssim[1] = 0; frm_stat->stat_frm.f_ssim[2] = 0; } #endif /* make sure all row context has been released */ while (h->b_all_row_ctx_released == 0) { xavs2_sleep_ms(1); } /* release the reconstructed frame */ release_one_frame(h, h->fdec); /* update rate control */ xavs2_rc_update_after_frame_coded(h, h->fenc->i_bs_len * 8, h->i_qp, h->fenc->i_frm_type, h->fenc->i_frame); /* output this encoded frame */ output_frame.frm_enc = h->fenc; output_frame.next = NULL; #if XAVS2_STAT memcpy(&output_frame.out_frm_stat, &h->frameinfo->frame_stat, sizeof(output_frame.out_frm_stat)); /* report frame encoding */ if (output_frame.frm_enc->i_bs_len > 0) { encoder_report_one_frame(h, &output_frame); if (output_frame.frm_enc->i_bs_len >= (output_frame.frm_enc->i_bs_buf >> 2)) { h->h_top->stat.num_frame_small_qp++; if ((h->h_top->stat.num_frame_small_qp & 128) == 1) { if (output_frame.frm_enc->i_bs_len > output_frame.frm_enc->i_bs_buf) { xavs2_log(h, XAVS2_LOG_ERROR, "Frame bitstream exceeds the BS buffer size. num:%d\n", h->h_top->stat.num_frame_small_qp); } else { xavs2_log(h, XAVS2_LOG_WARNING, "Frame bitstream exceeds 1/4 BS buffer size. 
num %d\n", h->h_top->stat.num_frame_small_qp); } } } } #endif /* output bitstream and recycle input frame */ { xavs2_handler_t *h_mgr = h->h_top; while (h_mgr->i_exit_flag != XAVS2_EXIT_THREAD) { /* wait until it is time for output of this frame */ if (h_mgr->i_frame_aec == h->i_aec_frm) { break; } } xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ encoder_output_frame_bitstream(h_mgr, output_frame.frm_enc); h_mgr->i_frame_aec = Advance2NextFrame(h_mgr, h_mgr->i_frame_aec); xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ } /* set task status */ encoder_set_task_status(h, XAVS2_TASK_AEC_DONE); return NULL; } /** * --------------------------------------------------------------------------- * Function : flush reconstructed frames from the encoder * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * [out] : none * Return : none * --------------------------------------------------------------------------- */ static void encoder_flush(xavs2_handler_t *h_mgr) { int i = 0; if (h_mgr == NULL) { return; } xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ /* wait until all tasks free */ while (i < h_mgr->i_frm_threads) { xavs2_t *h_frm_coder = h_mgr->frm_contexts[i]; if (h_frm_coder && h_frm_coder->task_status != XAVS2_TASK_FREE) { /* use 'sleep()' instead ? */ xavs2_thread_cond_wait(&h_mgr->cond[SIG_FRM_CONTEXT_RELEASED], &h_mgr->mutex); /* recheck all */ i = 0; continue; } i++; } xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ encoder_write_rec_frame(h_mgr); } /** * --------------------------------------------------------------------------- * Function : determine the MVD's value (1/4 pixel) is legal or not * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * [out] : - (-1) fail, (0) success * Return : none * --------------------------------------------------------------------------- */ static int encoder_decide_mv_range(xavs2_t *h) { /* set horizontal mv range */ h->min_mv_range[0] = -8192; h->max_mv_range[0] = 8191; if (h->param->profile_id == MAIN10_PROFILE || h->param->profile_id == MAIN_PROFILE) { if (h->param->i_frame_threads > 1) { /* set vertical mv range */ h->min_mv_range[1] = -((1 << h->i_lcu_level) << 2); h->max_mv_range[1] = ((1 << h->i_lcu_level) << 2) - 1; } else { /* set vertical mv range */ if (h->param->level_id >= 0x40) { h->min_mv_range[1] = -2048; h->max_mv_range[1] = 2047; } else if (h->param->level_id >= 0x20) { h->min_mv_range[1] = -1024; h->max_mv_range[1] = 1023; } else if (h->param->level_id >= 0x10) { h->min_mv_range[1] = -512; h->max_mv_range[1] = 511; } else { return -1; } } /* scale for field coding */ if (h->param->InterlaceCodingOption == FIELD_CODING) { h->min_mv_range[1] >>= 1; h->max_mv_range[1] >>= 1; } return 0; } return -1; } /** * --------------------------------------------------------------------------- * Function : determine the appropriate LevelID * --------------------------------------------------------------------------- */ static void encoder_decide_level_id(xavs2_param_t *param) { const int tab_level_restriction[][5] = { /* LevelID, MaxWidth, MaxHeight, MaxFps, MaxKBps */ { 0x00, 8192, 8192, 0, 0 }, // ֹ { 0x10, 352, 288, 15, 1500 }, // 2.0.15 { 0x12, 352, 288, 30, 2000 }, // 2.0.30 { 0x14, 352, 288, 60, 2500 }, // 2.0.60 { 0x20, 720, 576, 30, 6000 }, // 4.0.30 { 0x22, 720, 576, 60, 10000 }, // 4.0.60 { 0x40, 2048, 1152, 30, 12000 }, // 6.0.30 { 0x42, 2048, 1152, 30, 30000 }, // 6.0.60 { 0x44, 2048, 1152, 60, 20000 }, // 6.0.120 { 0x46, 2048, 1152, 60, 50000 }, // 6.2.120 { 0x48, 2048, 
1152, 120, 25000 }, // 6.0.120 { 0x4A, 2048, 1152, 120, 100000 }, // 6.2.120 { 0x50, 4090, 2304, 30, 25000 }, // 8.0.30 { 0x52, 4090, 2304, 30, 25000 }, // 8.2.30 { 0x54, 4090, 2304, 60, 40000 }, // 8.0.60 { 0x56, 4090, 2304, 60, 160000 }, // 8.2.60 { 0x58, 4090, 2304, 120, 60000 }, // 8.0.120 { 0x5A, 4090, 2304, 120, 240000 }, // 8.2.120 { 0x60, 8192, 4608, 30, 60000 }, // 10.0.30 { 0x62, 8192, 4608, 30, 240000 }, // 10.2.30 { 0x64, 8192, 4608, 60, 120000 }, // 10.0.60 { 0x66, 8192, 4608, 60, 480000 }, // 10.2.60 { 0x68, 8192, 4608, 120, 240000 }, // 10.0.120 { 0x6A, 8192, 4608, 120, 800000 }, // 10.2.120 { 0x00, 16384, 8192, 120, 8000000 }, // ֹ }; int i = 1; int i_last_level = 0; for (; tab_level_restriction[i][4] != 0;) { /* δʱΪ */ if (param->i_rc_method == 0 && param->org_width <= tab_level_restriction[i_last_level][1] && param->org_height <= tab_level_restriction[i_last_level][2] && param->org_width <= tab_level_restriction[i][1] && param->org_height <= tab_level_restriction[i][2] && tab_level_restriction[i_last_level][1] < tab_level_restriction[i][1] && tab_level_restriction[i_last_level][2] < tab_level_restriction[i][2]) { /* ʿδʱѡķֱµߵ */ i = i_last_level; break; } /* ֱʡ֡ʷҪ */ if (param->org_width <= tab_level_restriction[i][1] && param->org_height <= tab_level_restriction[i][2] && param->frame_rate <= tab_level_restriction[i][3]) { i_last_level = i; /* 趨ɸLevelID */ if (param->i_rc_method != 0 && param->i_target_bitrate * 1.5 <= tab_level_restriction[i][4] * 1000 && param->bitrate_upper <= tab_level_restriction[i][4] * 1000) { break; } } i++; } param->level_id = tab_level_restriction[i][0]; } /* --------------------------------------------------------------------------- */ int encoder_check_parameters(xavs2_param_t *param) { int num_max_slice = ((param->org_height + (1 << param->lcu_bit_level) - 1) >> param->lcu_bit_level) >> 1; num_max_slice = XAVS2_MAX(2, num_max_slice); /* check number of threaded frames */ if (param->i_frame_threads > MAX_PARALLEL_FRAMES) { xavs2_log(NULL, XAVS2_LOG_ERROR, "too many threaded frames : %d. increase MAX_PARALLEL_FRAMES (%d) and recompile.\n", param->i_frame_threads, MAX_PARALLEL_FRAMES); return -1; } /* check slice number */ if (param->slice_num > MAX_SLICES || param->slice_num > num_max_slice) { xavs2_log(NULL, XAVS2_LOG_ERROR, "too many slices : %d. 
exceeds MAX_SLICES (%d) or LcuRows/2 (%d).\n", param->slice_num, MAX_SLICES, num_max_slice); return -1; } /* Slice²ܿ cross slice loop filterӰ첢Ч * TODO: ֧ */ if (param->slice_num > 1 && param->b_cross_slice_loop_filter != FALSE) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Un-supported cross slice loop filter, forcing not filtering\n"); param->b_cross_slice_loop_filter = FALSE; } /* check frame rate */ if (param->frame_rate_code > 8 || param->frame_rate_code < 1) { xavs2_log(NULL, XAVS2_LOG_ERROR, "FrameRate should be in 1..8 (1: 24000/1001,2: 24,3: 25,4: 30000/1001,5: 30,6: 50,7: 60000/1001,8: 60)\n"); return -1; } param->frame_rate = FRAME_RATE[param->frame_rate_code - 1]; /* check LCU size */ if (param->lcu_bit_level < B16X16_IN_BIT || param->lcu_bit_level > B64X64_IN_BIT){ xavs2_log(NULL, XAVS2_LOG_ERROR, "MaxSizeInBit must be in 4..6 (LCU size: 16x16, 32x32, 64x64)\n"); return -1; } /* check intra period */ xavs2_log(NULL, XAVS2_LOG_DEBUG, "IntraPeriod { Min %d Max %d }, BFrames %d, OpenGOP %d\n", param->intra_period_min, param->intra_period_max, param->num_bframes, param->b_open_gop); if (param->intra_period_max == -1) { param->intra_period_max = param->frame_rate; } if (param->intra_period_min == -1) { param->intra_period_min = param->intra_period_max; } if (param->intra_period_min > param->intra_period_max) { xavs2_log(NULL, XAVS2_LOG_WARNING, "IntraPeriod: swapped Min/Max\n"); XAVS2_SWAP(param->intra_period_max, param->intra_period_min); } /* Only support GOP size divisible by 8 while using RA with openGOP */ if (param->b_open_gop && param->num_bframes) { int period = param->intra_period_max / XAVS2_ABS(param->i_gop_size); if (param->intra_period_max % XAVS2_ABS(param->i_gop_size)) { param->intra_period_max = (period + 1) * XAVS2_ABS(param->i_gop_size); xavs2_log(NULL, XAVS2_LOG_WARNING, "IntraPeriodMax Fixed for OpenGOP => %d\n", param->intra_period_max); } } if (param->profile_id == MAIN_PICTURE_PROFILE && (param->intra_period_max != 1 || param->intra_period_min != 1)) { xavs2_log(NULL, XAVS2_LOG_ERROR, "MAIN picture file only supports intra picture coding!\n"); return -1; } /* update profile id */ if (param->sample_bit_depth == 8) { param->profile_id = MAIN_PROFILE; } else { if (param->profile_id != MAIN10_PROFILE && param->sample_bit_depth > 8) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Forcing Main10 Profile for high bit-depth coding\n"); param->profile_id = MAIN10_PROFILE; } } /* check bit depth */ if (param->profile_id != MAIN_PROFILE) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Not Supported profile \"%d\", HIGH_BIT_DEPTH macro haven`t turn on!\n", param->profile_id); return -1; } /* check LevelID */ encoder_decide_level_id(param); if (param->level_id <= 0 || param->level_id > 0x6A) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Not Supported LevelID: %dx%d, %.3f fps, %d bps!\n", param->org_width, param->org_height, param->frame_rate, param->i_target_bitrate); return -1; } /* check chroma format */ if (param->chroma_format != CHROMA_420 && param->chroma_format != CHROMA_400) { xavs2_log(NULL, XAVS2_LOG_ERROR, "invalid chroma format %d\n", param->chroma_format); exit(-1); } /* check reference configuration */ if (param->num_bframes >= XAVS2_MAX_GOP_SIZE) { xavs2_log(NULL, XAVS2_LOG_ERROR, "The number of successive B-frame is too big!\n"); return -1; } if (param->num_bframes > 0 && param->num_bframes + 1 != XAVS2_ABS(param->i_gop_size)) { xavs2_log(NULL, XAVS2_LOG_ERROR, "The number of successive B-frame is wrong!\n"); return -1; } if (rps_check_config(param) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error 
found in RPS configuration!\n"); return -1; } /* GOP parallel encoding */ if (param->num_parallel_gop < 1) { param->num_parallel_gop = 1; } else if (param->num_parallel_gop > 1 && param->b_open_gop) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Only ClosedGOP can be utilized with GOP parallel encoding\n"); param->b_open_gop = FALSE; } /* check preset level */ if (param->preset_level < 0 || param->preset_level > 9) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter preset_level, check configuration file\n"); return -1; } else { if (param->is_preset_configured == FALSE) { /* modify configurations according to the input preset level */ parse_preset_level(param, param->preset_level); } } /* check QP */ if (param->i_initial_qp > MAX_QP || param->i_initial_qp < MIN_QP) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter quant_0, check configuration file\n"); return -1; } if (param->i_initial_qp < 25 + 8 * (param->sample_bit_depth - 8)) { xavs2_log(NULL, XAVS2_LOG_WARNING, "A small QP is configured: QP: %d, EncodingBitDepth: %d, Suggested QP: >=%d\n", param->i_initial_qp, param->sample_bit_depth, 25 + 8 * (param->sample_bit_depth - 8)); } if (param->i_max_qp > 63 + (param->sample_bit_depth - 8) * 8) { xavs2_log(NULL, XAVS2_LOG_WARNING, "A too large max QP is configured: QP: %d, EncodingBitDepth: %d, Available QP: <=%d\n", param->i_max_qp, param->sample_bit_depth, 63 + 8 * (param->sample_bit_depth - 8)); param->i_max_qp = 63 + (param->sample_bit_depth - 8) * 8; } if (param->i_min_qp < 0) { param->i_min_qp = 0; } param->i_initial_qp = XAVS2_CLIP3(param->i_min_qp, param->i_max_qp, param->i_initial_qp); /* check LCU level */ if (param->lcu_bit_level > 6 || param->lcu_bit_level < 3) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter MaxSizeInBit, check configuration file\n"); return -1; } /* check range of filter offsets */ if (param->alpha_c_offset > 8 || param->alpha_c_offset < -8) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter LFAlphaC0Offset, check configuration file\n"); return -1; } if (param->beta_offset > 8 || param->beta_offset < -8) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error input parameter LFBetaOffset, check configuration file\n"); return -1; } /* check ALF configuration */ if (param->i_frame_threads != 1 && param->enable_alf != 0) { param->enable_alf = 0; xavs2_log(NULL, XAVS2_LOG_WARNING, "ALF disabled since frame parallel encoding is enabled.\n"); } /* FIXME: set bitrate (lower and upper) */ param->bitrate_lower = (param->i_target_bitrate / 400) & 0x3FFFF; /* lower 18 bits */ param->bitrate_upper = (param->i_target_bitrate / 400) >> 18; /* upper 12 bits */ /* set for field coding */ if (param->InterlaceCodingOption == FIELD_CODING) { param->org_height = param->org_height >> 1; param->intra_period_max = param->intra_period_max << 1; param->intra_period_min = param->intra_period_min << 1; } /* low delay? */ if (param->num_bframes == 0) { param->low_delay = TRUE; } else { param->low_delay = FALSE; } /* Rate-Control */ #if !ENABLE_RATE_CONTROL_CU if (param->i_rc_method == XAVS2_RC_CBR_SCU) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Rate Control with CU level control disabled in this version.\n"); param->i_rc_method = XAVS2_RC_CBR_FRM; } #endif if (param->i_rc_method == XAVS2_RC_CBR_SCU) { param->fixed_picture_qp = FALSE; } else { param->fixed_picture_qp = TRUE; } /* consistency check num_max_ref */ if (param->num_max_ref < 1) { param->num_max_ref = 1; } /* enable TDRDO? 
TDRDO is only just for low delay */ if (param->num_bframes != 0 || param->intra_period_min > 0) { param->enable_tdrdo = 0; } /* set display properties */ // param->display_horizontal_size = param->org_width; // param->display_vertical_size = param->org_height; param->sample_precision = ((param->input_sample_bit_depth - 6) / 2); param->aspect_ratio_information = 1; #if !ENABLE_WQUANT /* weighting quantization */ param->enable_wquant = 0; /* disable */ #endif return 0; } /* --------------------------------------------------------------------------- * assign pointers for all coding tree units (till 4x4 CU) */ static void build_coding_tree(xavs2_t *h, cu_t *p_cu, int idx_zorder, int i_level, int i_pos_x, int i_pos_y) { int i; int idx_cu_bfs = 0; p_cu->i_size = 1 << i_level; p_cu->cu_info.i_level = (int8_t)i_level; p_cu->i_pos_x = i_pos_x; p_cu->i_pos_y = i_pos_y; p_cu->in_lcu_edge = ((i_pos_y != 0) << 1) + (i_pos_x != 0); p_cu->idx_zorder = (int8_t)idx_zorder; idx_cu_bfs = tab_cu_bfs_order[i_level - MIN_CU_SIZE_IN_BIT]; idx_cu_bfs += (i_pos_y >> i_level) * (MAX_CU_SIZE >> i_level) + (i_pos_x >> i_level); p_cu->idx_cu_bfs = (int8_t)idx_cu_bfs; if (i_level > B8X8_IN_BIT) { int num_parts = 1 << ((i_level - B16X16_IN_BIT) << 1); i_level--; for (i = 0; i < 4; i++) { p_cu->sub_cu[i] = &h->lcu.all_cu[h->lcu.i_scu_xy++]; i_pos_x = p_cu->i_pos_x + ((i & 1) << i_level); i_pos_y = p_cu->i_pos_y + ((i >> 1) << i_level); build_coding_tree(h, p_cu->sub_cu[i], idx_zorder + i * num_parts, i_level, i_pos_x, i_pos_y); } } else { for (i = 0; i < 4; i++) { p_cu->sub_cu[i] = NULL; } } } /* --------------------------------------------------------------------------- */ static xavs2_t *encoder_create_frame_context(const xavs2_param_t *param, int idx_frm_encoder) { const int num_slices = param->slice_num; xavs2_t *h = NULL; int frame_w = ((param->org_width + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; int frame_h = ((param->org_height + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; int size_lcu = 1 << param->lcu_bit_level; /* size of a LCU (largest coding unit) */ int w_in_lcu = (frame_w + size_lcu - 1) >> param->lcu_bit_level; int h_in_lcu = (frame_h + size_lcu - 1) >> param->lcu_bit_level; int w_in_scu = frame_w >> MIN_CU_SIZE_IN_BIT; int h_in_scu = frame_h >> MIN_CU_SIZE_IN_BIT; int w_in_4x4 = frame_w >> MIN_PU_SIZE_IN_BIT; int h_in_4x4 = frame_h >> MIN_PU_SIZE_IN_BIT; int bs_size = frame_w * frame_h * 2; int ipm_size = (w_in_4x4 + 16) * ((size_lcu >> MIN_PU_SIZE_IN_BIT) + 1); int size_4x4 = w_in_4x4 * h_in_4x4; int qpel_frame_size = (frame_w + 2 * XAVS2_PAD) * (frame_h + 2 * XAVS2_PAD); int info_size = sizeof(frame_info_t) + h_in_lcu * sizeof(row_info_t) + w_in_lcu * h_in_lcu * sizeof(lcu_info_t); int size_sao_stats = w_in_lcu * h_in_lcu * sizeof(SAOStatData[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES]); int size_sao_param = w_in_lcu * h_in_lcu * sizeof(SAOBlkParam[NUM_SAO_COMPONENTS]); int size_sao_onoff = h_in_lcu * sizeof(int[NUM_SAO_COMPONENTS]); size_t size_alf = alf_get_buffer_size(param); int frame_size_in_scu = w_in_scu * h_in_scu; int num_me_bytes = (w_in_4x4 * h_in_4x4)* sizeof(dist_t[MAX_INTER_MODES][MAX_REFS]); size_t size_extra_frame_buffer = 0; int i, j; int scu_xy = 0; cu_info_t *p_cu_info; size_t mem_size = 0; uint8_t *mem_base; num_me_bytes = (num_me_bytes + 255) >> 8 << 8; /* align number of bytes to 256 */ qpel_frame_size = (qpel_frame_size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1); size_extra_frame_buffer = (param->enable_tdrdo + param->enable_sao + 
param->enable_alf) * xavs2_frame_buffer_size(param, FT_TEMP); /* compute the space size and alloc buffer */ mem_size = sizeof(xavs2_t) + /* xavs2_t */ sizeof(nal_t) * (MAX_SLICES + 6) + /* all nal units */ sizeof(uint8_t) * XAVS2_BS_HEAD_LEN + /* bitstream buffer (frame header only) */ sizeof(uint8_t) * bs_size + /* bitstream buffer for all slices */ sizeof(slice_t) * MAX_SLICES + /* slice array */ sizeof(pel_t) * (frame_w * 2) * num_slices + /* buffer for intra_border */ sizeof(uint8_t) * w_in_scu * 32 * num_slices + /* buffer for edge filter flag (of one LCU row) */ sizeof(int8_t) * ipm_size * num_slices + /* intra prediction mode buffer */ sizeof(int8_t) * size_4x4 + /* inter prediction direction */ sizeof(int8_t) * size_4x4 * 2 + /* reference frames */ sizeof(mv_t) * size_4x4 * 2 + /* reference motion vectors */ CACHE_LINE_SIZE * (MAX_SLICES + 32); mem_size += qpel_frame_size * 3 * sizeof(mct_t) + /* temporary buffer for 1/4 interpolation: a,1,b */ xavs2_me_get_buf_size(param) + /* buffers in me module */ info_size + /* the frame info structure */ frame_size_in_scu * sizeof(cu_info_t) + /* CU data */ num_me_bytes + /* Motion Estimation */ w_in_lcu * h_in_lcu * sizeof(int8_t) + /* CTU slice index */ size_extra_frame_buffer + /* extra frame buffer: TDRDO, SAO, ALF */ size_sao_stats + CACHE_LINE_SIZE + /* SAO stat data */ size_sao_param + CACHE_LINE_SIZE + /* SAO parameters */ size_sao_onoff + CACHE_LINE_SIZE + /* SAO on/off number of LCU row */ size_alf + CACHE_LINE_SIZE + /* ALF encoder contexts */ CACHE_LINE_SIZE * 30; /* used for align buffer */ /* alloc memory space */ mem_size = ((mem_size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE; CHECKED_MALLOC(mem_base, uint8_t *, mem_size); /* assign handle pointer of the xavs2 encoder */ h = (xavs2_t *)mem_base; memset(h, 0, sizeof(xavs2_t)); mem_base += sizeof(xavs2_t); ALIGN_POINTER(mem_base); /* align pointer */ /* init log module */ h->module_log.i_log_level = param->i_log_level; sprintf(h->module_log.module_name, "Enc[%2d] %06llx", idx_frm_encoder, (intptr_t)(h)); /* copy the input parameters */ h->param = param; /* const properties */ h->i_width = frame_w; h->i_height = frame_h; h->i_width_in_lcu = w_in_lcu; h->i_height_in_lcu = h_in_lcu; h->i_width_in_mincu = w_in_scu; h->i_height_in_mincu = h_in_scu; h->i_width_in_minpu = w_in_4x4; h->i_height_in_minpu = h_in_4x4; h->framerate = h->param->frame_rate; h->i_lcu_level = h->param->lcu_bit_level; h->i_scu_level = h->param->scu_bit_level; h->i_chroma_v_shift = h->param->chroma_format == CHROMA_420; h->i_max_ref = h->param->num_max_ref; h->b_progressive = (bool_t)h->param->progressive_frame; h->b_field_sequence = (h->param->InterlaceCodingOption == FIELD_CODING); /* set table which indicates numbers of intra prediction modes for RDO */ for (i = 0; i < MAX_CU_SIZE_IN_BIT; i++) { h->tab_num_intra_rdo[i] = 1; /* this will later be set according to the preset level */ } h->num_rdo_intra_chroma = NUM_INTRA_MODE_CHROMA; /* ------------------------------------------------------------- * assign buffer pointers of xavs2 encoder */ /* point to all nal units */ h->p_nal = (nal_t *)mem_base; mem_base += sizeof(nal_t) * (MAX_SLICES + 6); ALIGN_POINTER(mem_base); /* align pointer */ /* bitstream buffer (frame header) */ h->p_bs_buf_header = mem_base; h->i_bs_buf_header = sizeof(uint8_t) * XAVS2_BS_HEAD_LEN; mem_base += sizeof(uint8_t) * XAVS2_BS_HEAD_LEN; ALIGN_POINTER(mem_base); /* align pointer */ /* bitstream buffer for all slices */ h->p_bs_buf_slice = mem_base; h->i_bs_buf_slice = 
sizeof(uint8_t) * bs_size; mem_base += sizeof(uint8_t) * bs_size; ALIGN_POINTER(mem_base); /* align pointer */ /* slice array */ for (i = 0; i < num_slices; i++) { slice_t *p_slice = (slice_t *)mem_base; h->slices[i] = p_slice; mem_base += sizeof(slice_t); ALIGN_POINTER(mem_base); /* align pointer */ /* intra prediction mode buffer */ p_slice->slice_ipredmode = (int8_t *)mem_base; mem_base += sizeof(int8_t) * ipm_size; p_slice->slice_ipredmode += (h->i_width_in_minpu + 16) + 16; ALIGN_POINTER(mem_base); /* align pointer */ /* assign pointer to intra_border buffer */ p_slice->slice_intra_border[0] = (pel_t *)mem_base; mem_base += h->i_width * sizeof(pel_t); ALIGN_POINTER(mem_base); p_slice->slice_intra_border[1] = (pel_t *)mem_base; mem_base += (h->i_width / 2) * sizeof(pel_t); ALIGN_POINTER(mem_base); p_slice->slice_intra_border[2] = (pel_t *)mem_base; mem_base += (h->i_width / 2) * sizeof(pel_t); ALIGN_POINTER(mem_base); /* buffer for edge filter flag (of one LCU row) */ p_slice->slice_deblock_flag[0] = (uint8_t *)mem_base; mem_base += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t); p_slice->slice_deblock_flag[1] = (uint8_t *)mem_base; mem_base += h->i_width_in_mincu * (MAX_CU_SIZE / MIN_PU_SIZE) * sizeof(uint8_t); ALIGN_POINTER(mem_base); } slice_init_bufer(h, h->slices[0]); /* ------------------------------------------------------------- * fenc fdec * Y Y Y Y Y Y Y Y * Y Y Y Y Y Y Y Y * Y Y Y Y Y Y Y Y * Y Y Y Y Y Y Y Y * U U V V U U V V * U U V V U U V V */ /* assign pointers for p_fenc (Y/U/V pointers) */ h->lcu.p_fenc[0] = h->lcu.fenc_buf; h->lcu.p_fenc[1] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE; h->lcu.p_fenc[2] = h->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + (FENC_STRIDE / 2); /* assign pointers for p_fdec (Y/U/V pointers) */ h->lcu.p_fdec[0] = h->lcu.fdec_buf; h->lcu.p_fdec[1] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE; h->lcu.p_fdec[2] = h->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + (FDEC_STRIDE / 2); /* slice index of CTUs */ h->lcu_slice_idx = (int8_t *)mem_base; mem_base += w_in_lcu * h_in_lcu * sizeof(int8_t); ALIGN_POINTER(mem_base); /* align pointer */ /* inter prediction mode */ h->dir_pred = (int8_t *)mem_base; mem_base += sizeof(int8_t) * size_4x4; ALIGN_POINTER(mem_base); /* align pointer */ /* reference frames */ h->fwd_1st_ref = (int8_t *)mem_base; mem_base += sizeof(int8_t) * size_4x4; ALIGN_POINTER(mem_base); /* align pointer */ h->bwd_2nd_ref = (int8_t *)mem_base; mem_base += sizeof(int8_t) * size_4x4; ALIGN_POINTER(mem_base); /* align pointer */ /* reference motion vectors */ h->fwd_1st_mv = (mv_t *)mem_base; mem_base += sizeof(mv_t) * size_4x4; ALIGN_POINTER(mem_base); /* align pointer */ h->bwd_2nd_mv = (mv_t *)mem_base; mem_base += sizeof(mv_t) * size_4x4; ALIGN_POINTER(mem_base); /* align pointer */ /* temporary buffer for 1/4 interpolation: a,1,b, alone buffer */ h->img4Y_tmp[0] = (mct_t *)mem_base; h->img4Y_tmp[1] = h->img4Y_tmp[0] + qpel_frame_size; h->img4Y_tmp[2] = h->img4Y_tmp[0] + qpel_frame_size * 2; mem_base += qpel_frame_size * 3 * sizeof(mct_t); ALIGN_POINTER(mem_base); /* SAO data */ h->sao_stat_datas = (SAOStatData (*)[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES])mem_base; memset(h->sao_stat_datas[0], 0, size_sao_stats); mem_base += size_sao_stats; ALIGN_POINTER(mem_base); h->sao_blk_params = (SAOBlkParam (*)[NUM_SAO_COMPONENTS])mem_base; memset(h->sao_blk_params[0], 0, size_sao_param); mem_base += size_sao_param; ALIGN_POINTER(mem_base); h->num_sao_lcu_off = (int (*)[NUM_SAO_COMPONENTS])mem_base; 
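    /* per-LCU-row SAO on/off bookkeeping: one int per component for each
     * LCU row (size_sao_onoff bytes), carved from the same allocation as
     * the buffers above and zero-initialised right below */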
memset(h->num_sao_lcu_off[0], 0, size_sao_onoff); mem_base += size_sao_onoff; ALIGN_POINTER(mem_base); /* init memory space in me module */ xavs2_me_init(h, &mem_base); /* allocate frame_info_t (one for each frame context) */ h->frameinfo = (frame_info_t *)mem_base; mem_base += sizeof(frame_info_t); ALIGN_POINTER(mem_base); /* align pointer */ h->frameinfo->rows = (row_info_t *)mem_base; mem_base += sizeof(row_info_t) * h_in_lcu; ALIGN_POINTER(mem_base); /* align pointer */ /* set available tables */ set_available_tables(h); /* assign pointers for all coding tree units */ h->lcu.p_ctu = &h->lcu.all_cu[0]; h->lcu.i_scu_xy = 1; // borrowed build_coding_tree(h, h->lcu.p_ctu, 0, h->i_lcu_level, 0, 0); h->lcu.i_scu_xy = 0; // reset /* set row info */ for (i = 0; i < h_in_lcu; i++) { row_info_t *row = &h->frameinfo->rows[i]; row->h = 0; row->row = i; row->coded = -1; row->lcus = (lcu_info_t *)mem_base; mem_base += sizeof(lcu_info_t) * w_in_lcu; if (xavs2_thread_mutex_init(&row->mutex, NULL)) { goto fail; } if (xavs2_thread_cond_init(&row->cond, NULL)) { goto fail; } } /* check memory size */ ALIGN_POINTER(mem_base); /* align pointer */ /* ------------------------------------------------------------- * allocate other alone spaces for xavs2 encoder */ h->cu_info = (cu_info_t *)mem_base; mem_base += frame_size_in_scu * sizeof(cu_info_t); ALIGN_POINTER(mem_base); p_cu_info = h->cu_info; for (j = 0; j < h_in_scu; j++) { for (i = 0; i < w_in_scu; i++) { scu_xy++; p_cu_info->i_scu_x = i; p_cu_info->i_scu_y = j; p_cu_info++; } } /* motion estimation buffer */ h->all_mincost = (dist_t(*)[MAX_INTER_MODES][MAX_REFS])mem_base; mem_base += num_me_bytes; ALIGN_POINTER(mem_base); // allocate memory for current frame if (h->param->enable_tdrdo) { h->img_luma_pre = xavs2_frame_new(h, &mem_base, FT_TEMP); ALIGN_POINTER(mem_base); } else { h->img_luma_pre = NULL; } if (h->param->enable_sao) { h->img_sao = xavs2_frame_new(h, &mem_base, FT_TEMP); ALIGN_POINTER(mem_base); } else { h->img_sao = NULL; } if (h->param->enable_alf) { h->img_alf = xavs2_frame_new(h, &mem_base, FT_TEMP); ALIGN_POINTER(mem_base); alf_init_buffer(h, mem_base); mem_base += size_alf; ALIGN_POINTER(mem_base); } else { h->img_alf = NULL; } if ((uintptr_t)(h) + mem_size < (uintptr_t)(mem_base)) { /* malloc size allocation error: no enough memory */ goto fail; } /* ------------------------------------------------------------- * init other properties/modules for xavs2 encoder */ /* init all slices */ xavs2_slices_init(h); #if ENABLE_WQUANT /* adaptive frequency weighting quantization */ if (h->param->enable_wquant) { xavs2_wq_init_seq_quant_param(h); } #endif return h; fail: return NULL; } /* --------------------------------------------------------------------------- */ static void encoder_destroy_frame_context(xavs2_t *h) { int i; assert(h != NULL); assert(h->task_type == XAVS2_TASK_FRAME); h->img_luma_pre = NULL; h->img_sao = NULL; h->img_alf = NULL; h->enc_alf = NULL; /* free frame_info_t & row_info_t */ if (h->frameinfo) { for (i = 0; i < h->i_height_in_lcu; i++) { /* free a row */ row_info_t *row = &h->frameinfo->rows[i]; if (row) { xavs2_thread_mutex_destroy(&row->mutex); xavs2_thread_cond_destroy(&row->cond); } } } xavs2_free(h); } /* --------------------------------------------------------------------------- * allocate memory for multiple threads (slices/frames parallel) */ int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr) { int i; /* ------------------------------------------------------------- * build lcu row encoding contexts 
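     * (each row context starts as a shallow copy of the frame context: only
     * the byte range communal_vars_1 .. communal_vars_2 is copied, while the
     * per-context coding tree and the p_fenc/p_fdec LCU pointers are rebuilt
     * individually below)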
*/ if (h_mgr->num_row_contexts > 1) { CHECKED_MALLOC(h_mgr->row_contexts, xavs2_t *, h_mgr->num_row_contexts * sizeof(xavs2_t)); for (i = 0; i < h_mgr->num_row_contexts; i++) { xavs2_t *h_row_coder = &h_mgr->row_contexts[i]; memcpy(&h_row_coder->communal_vars_1, &h->communal_vars_1, (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1); /* identify ourself */ h_row_coder->task_type = XAVS2_TASK_ROW; /* we are free */ h_row_coder->i_aec_frm = -1; /* assign pointers for all coding tree units */ h_row_coder->lcu.p_ctu = &h_row_coder->lcu.all_cu[0]; h_row_coder->lcu.i_scu_xy = 1; // borrowed build_coding_tree(h_row_coder, h_row_coder->lcu.p_ctu, 0, h_row_coder->i_lcu_level, 0, 0); h_row_coder->lcu.i_scu_xy = 0; // reset /* assign pointers for p_fenc (Y/U/V pointers) */ h_row_coder->lcu.p_fenc[0] = h_row_coder->lcu.fenc_buf; h_row_coder->lcu.p_fenc[1] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE; h_row_coder->lcu.p_fenc[2] = h_row_coder->lcu.fenc_buf + FENC_STRIDE * MAX_CU_SIZE + FENC_STRIDE / 2; /* assign pointers for p_fdec (Y/U/V pointers) */ h_row_coder->lcu.p_fdec[0] = h_row_coder->lcu.fdec_buf; h_row_coder->lcu.p_fdec[1] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE; h_row_coder->lcu.p_fdec[2] = h_row_coder->lcu.fdec_buf + FDEC_STRIDE * MAX_CU_SIZE + FDEC_STRIDE / 2; } } /* ------------------------------------------------------------- * build frame encoding contexts */ h_mgr->frm_contexts[0] = h; /* context 0 is the main encoder handle */ for (i = 1; i < h_mgr->i_frm_threads; i++) { if ((h_mgr->frm_contexts[i] = encoder_create_frame_context(h->param, i)) == 0) { goto fail; } memcpy(&h_mgr->frm_contexts[i]->communal_vars_1, &h->communal_vars_1, (uint8_t *)&h->communal_vars_2 - (uint8_t *)&h->communal_vars_1); } return 0; fail: return -1; } /* --------------------------------------------------------------------------- * free all contexts except for the main context : xavs2_handler_t::contexts[0] */ static void encoder_contexts_free(xavs2_handler_t *h_mgr) { int i = 0; /* free all row contexts */ if (h_mgr->row_contexts != NULL) { xavs2_free(h_mgr->row_contexts); h_mgr->row_contexts = NULL; } /* free frame contexts */ for (i = 0; i < h_mgr->i_frm_threads; i++) { /* free the xavs2 encoder */ if (h_mgr->frm_contexts[i] != NULL) { encoder_destroy_frame_context(h_mgr->frm_contexts[i]); h_mgr->frm_contexts[i] = NULL; } } } /* --------------------------------------------------------------------------- * free the task manager */ void encoder_task_manager_free(xavs2_handler_t *h_mgr) { int i = 0; assert(h_mgr != NULL); /* signal to exit */ h_mgr->i_exit_flag = XAVS2_EXIT_THREAD; /* wait until the aec thread finish its job */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_FRM_CONTEXT_ALLOCATED]); /* destroy the AEC thread pool */ if (h_mgr->threadpool_aec != NULL) { xavs2_threadpool_delete(h_mgr->threadpool_aec); } /* wait until the output thread finish its job */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_FRM_AEC_COMPLETED]); xavs2_thread_mutex_destroy(&h_mgr->mutex); for (i = 0; i < SIG_COUNT; i++) { xavs2_thread_cond_destroy(&h_mgr->cond[i]); } /* destroy the RDO thread pool */ if (h_mgr->i_frm_threads > 1 || h_mgr->i_row_threads > 1) { xavs2_threadpool_delete(h_mgr->threadpool_rdo); } #if XAVS2_STAT /* report everything */ encoder_report_stat_info(h_mgr->p_coder); #endif /* destroy TDRDO */ tdrdo_destroy(h_mgr->td_rdo); /* destroy the rate control */ xavs2_rc_destroy(h_mgr->rate_control); #if XAVS2_DUMP_REC /* close rec file */ if (h_mgr->h_rec_file) { 
fclose(h_mgr->h_rec_file); h_mgr->h_rec_file = NULL; } #endif /* free contexts */ encoder_contexts_free(h_mgr); /* free memory of all lists */ destroy_all_lists(h_mgr); frame_buffer_destroy(h_mgr, &h_mgr->dpb); } /* --------------------------------------------------------------------------- * reset the decoding frame */ static void init_decoding_frame(xavs2_t *h) { #if SAVE_CU_INFO int frame_size_in_mincu = h->i_width_in_mincu * h->i_height_in_mincu; #endif int frame_size_in_mvstore = ((h->i_width_in_minpu + 3) >> 2) * ((h->i_height_in_minpu + 3) >> 2); int i; /* set frame properties */ h->fdec->i_frame = h->fenc->i_frame; h->fdec->i_frm_type = h->fenc->i_frm_type; h->fdec->i_pts = h->fenc->i_pts; h->fdec->i_dts = h->fenc->i_dts; h->fdec->i_frm_coi = h->fenc->i_frm_coi; h->fdec->i_gop_idr_coi = h->fenc->i_gop_idr_coi; h->fdec->rps.temporal_id = h->i_layer; if (h->b_field_sequence == 0) { h->fdec->i_frm_poc = h->fdec->i_frame << 1; } else { assert(0); // field sequences } /* set ref_dpoc */ for (i = 0; i < sizeof(h->fdec->ref_dpoc) / sizeof(h->fdec->ref_dpoc[0]); i++) { h->fdec->ref_dpoc[i] = MULTIx2; h->fdec->ref_dpoc_multi[i] = 1; } if (h->i_type == SLICE_TYPE_B) { h->fdec->ref_dpoc[B_BWD] = ((h->fref[B_BWD]->i_frm_poc - h->fdec->i_frm_poc) + 512) & 511; h->fdec->ref_dpoc[B_FWD] = ((h->fdec->i_frm_poc - h->fref[B_FWD]->i_frm_poc) + 512) & 511; h->fdec->ref_dpoc_multi[B_BWD] = MULTI / h->fdec->ref_dpoc[B_BWD]; h->fdec->ref_dpoc_multi[B_FWD] = MULTI / h->fdec->ref_dpoc[B_FWD]; } else if (h->i_type != SLICE_TYPE_I) { /* F/P frame */ for (i = 0; i < h->i_ref; i++) { h->fdec->ref_dpoc[i] = (h->fdec->i_frm_poc - h->fref[i]->i_frm_poc + 512) & 511; h->fdec->ref_dpoc_multi[i] = MULTI / h->fdec->ref_dpoc[i]; } } /* reset mv buffer */ g_funcs.fast_memzero(h->fdec->pu_mv, frame_size_in_mvstore * sizeof(mv_t)); /* reset ref buffer */ g_funcs.fast_memset(h->fdec->pu_ref, INVALID_REF, frame_size_in_mvstore * sizeof(int8_t)); #if SAVE_CU_INFO /* reset CU BitSize buffer */ g_funcs.fast_memzero(h->fdec->cu_level, frame_size_in_mincu * sizeof(int8_t)); /* reset CU type buffer */ g_funcs.fast_memzero(h->fdec->cu_mode, frame_size_in_mincu * sizeof(int8_t)); /* reset CU cbp buffer */ g_funcs.fast_memzero(h->fdec->cu_cbp, frame_size_in_mincu * sizeof(int8_t)); #endif } /* --------------------------------------------------------------------------- * init function handles */ static void encoder_init_func_handles(xavs2_t *h) { /* set some function handles according option or preset level */ if (h->param->enable_hadamard) { g_funcs.pixf.intra_cmp = g_funcs.pixf.satd; g_funcs.pixf.fpel_cmp = g_funcs.pixf.satd; } else { g_funcs.pixf.intra_cmp = g_funcs.pixf.sad; g_funcs.pixf.fpel_cmp = g_funcs.pixf.sad; } } /** * =========================================================================== * encoder function defines * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : create and initialize a xavs2 video encoder * Parameters : * [in ] : param - pointer to struct xavs2_param_t * : h_mgr - pointer to top handler * [out] : none * Return : handle of xavs2 encoder, none zero for success, otherwise false * --------------------------------------------------------------------------- */ xavs2_t *encoder_open(xavs2_param_t *param, xavs2_handler_t *h_mgr) { xavs2_t *h = NULL; #if XAVS2_STAT /* show header info */ encoder_show_head_info(param); #endif /* decide ultimaete coding parameters by preset level */ 
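    /* (illustrative note, assumed typical usage rather than a normative spec:
     *  the wrapper layer calls encoder_open() once, encoder_encode() for every
     *  input frame -- plus one XAVS2_FLUSH frame to drain the pipeline -- and
     *  finally encoder_close(); the call below fixes the preset-dependent
     *  parameters before any frame context is created) */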
decide_ultimate_paramters(param); /* init frame context */ if ((h = encoder_create_frame_context(param, 0)) == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "create frame context fail\n"); goto fail; } /* set fast algorithms according to the input preset level */ encoder_set_fast_algorithms(h); /* init top handler */ h->h_top = h_mgr; h->rc = h_mgr->rate_control; h->td_rdo = h_mgr->td_rdo; h->task_type = XAVS2_TASK_FRAME; /* we are a frame task */ h->task_status = XAVS2_TASK_FREE; /* ready for encoding */ h->i_aec_frm = -1; /* ready to be allocated */ h_mgr->frm_contexts[0] = h; /* point to the xavs2_t handle */ #if XAVS2_TRACE xavs2_trace_init(h->param); /* init trace */ #endif if (encoder_decide_mv_range(h) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "check mv range fail\n"); goto fail; } encoder_init_func_handles(h); /* init function handles */ xavs2_init_valid_mode_table(h); xavs2_me_init_umh_threshold(h, h->umh_bsize, h->param->i_initial_qp + 1); #if CTRL_OPT_AEC init_aec_context_tab(); #endif /* parse RPS */ rps_set_picture_reorder_delay(h); #if XAVS2_STAT encoder_show_frame_info_tab(h, h_mgr); #endif /* init LCU row order */ slice_lcu_row_order_init(h); return h; fail: encoder_close(h_mgr); return 0; } /** * --------------------------------------------------------------------------- * Function : init frame coding (init bitstream and picture header) * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * [out] : none * Return : the length of bitstream * --------------------------------------------------------------------------- */ static INLINE void xavs2e_frame_coding_init(xavs2_t *h) { /* prepare to encode ------------------------------------------- */ #if ENABLE_WQUANT if (h->param->intra_period_min != 0 && h->i_type == SLICE_TYPE_I) { // adaptive frequency weighting quantization if (h->param->enable_wquant) { xavs2_wq_init_seq_quant_param(h); } } if (h->param->enable_wquant && h->param->PicWQEnable) { xavs2_wq_init_pic_quant_param(h); xavs2_wq_update_pic_matrix(h); } #endif /* frame picture? */ if (h->param->InterlaceCodingOption == FIELD_CODING) { h->b_top_field = (h->fenc->i_frm_poc & 1) == 0; } /* get QP to encode -------------------------------------------- */ /* enable TD-RDO? 
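     * (when TD-RDO is enabled, tdrdo_frame_start() below and tdrdo_frame_done()
     *  at the end of the frame bracket the temporal-dependent RD adjustment)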
*/ if (h->td_rdo != NULL) { tdrdo_frame_start(h); } /* get frame level qp */ if (h->param->i_rc_method != XAVS2_RC_CQP) { int new_qp = h->i_qp; new_qp = xavs2_rc_get_frame_qp(h, h->fenc->i_frame, h->fenc->i_frm_type, h->fenc->i_qpplus1); /* calculate the lambda again */ if (new_qp != h->i_qp) { h->i_qp = new_qp; xavs2e_get_frame_lambda(h, h->fenc, new_qp); xavs2e_update_lambda(h, h->i_type, h->fenc->f_frm_lambda_ssd); } } /* confirm the encoding QP in the right range */ h->i_qp = XAVS2_CLIP3(h->param->i_min_qp, h->param->i_max_qp, h->i_qp); h->i_qp = clip_qp(h, h->i_qp); /* encoding begin ---------------------------------------------- */ /* ֡ʼ */ if (IS_ALG_ENABLE(OPT_CU_QSFD)) { qsfd_calculate_threshold_of_a_frame(h); } if (h->param->enable_intra || h->fenc->i_frm_type == XAVS2_TYPE_I) { h->fenc->b_enable_intra = 1; } else { h->fenc->b_enable_intra = 0; } } /** * --------------------------------------------------------------------------- * Function : encode a video frame * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * : frm - pointer to struct xavs2_picture_t * [out] : none * Return : the length of bitstream * --------------------------------------------------------------------------- */ void *xavs2e_encode_one_frame(void *arg) { xavs2_t *h = (xavs2_t *)arg; row_info_t *rows = h->frameinfo->rows; const int enable_wpp = h->h_top->i_row_threads > 1; int i; /* (1) init frame properties for frame coding ------------------------- */ xavs2e_frame_coding_init(h); h->pic_alf_on[0] = h->param->enable_alf; h->pic_alf_on[1] = h->param->enable_alf; h->pic_alf_on[2] = h->param->enable_alf; if (h->param->enable_alf && IS_ALG_ENABLE(OPT_FAST_ALF)) { if ((!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) { h->pic_alf_on[0] = 0; h->pic_alf_on[1] = 0; h->pic_alf_on[2] = 0; } } /* start AEC frame coding */ if (h->h_top->threadpool_aec != NULL && !h->param->enable_alf) { xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0); } /* (3) encode all LCU rows in current frame --------------------------- */ for (i = 0; i < h->i_height_in_lcu; i++) { int lcu_y = g_slice_lcu_row_order[i].lcu_y; int row_type = g_slice_lcu_row_order[i].row_type; row_info_t *row = &rows[lcu_y]; row_info_t *last_row; h->i_slice_index = g_slice_lcu_row_order[i].slice_idx; /* ǷҪ⴦Slice߽ */ row->b_top_slice_border = 0; row->b_down_slice_border = 0; /* ǰ֡ڵ */ if (row_type) { last_row = &rows[lcu_y - 1]; row->b_down_slice_border = (row_type == 2 && lcu_y != h->i_height_in_lcu - 1); } else { xavs2_slice_write_start(h); /* SliceĵһУʼ */ last_row = NULL; row->b_top_slice_border = (lcu_y > 0); } /* ȴο֡б */ xavs2e_inter_sync(h, lcu_y, 0); /* encode one lcu row */ if (enable_wpp && i != h->i_height_in_lcu - 1) { /* 1, һм߳̽б */ if ((row->h = xavs2e_alloc_row_task(h)) == NULL) { return NULL; } /* 2, 鵱ǰǷӦ * ΪȴһLCỤ߳ٵȴ1 */ wait_lcu_row_coded(last_row, 0); /* 3, ʹøм߳̽б */ xavs2_threadpool_run(h->h_top->threadpool_rdo, xavs2_lcu_row_write, row, 0); } else { row->h = h; xavs2_lcu_row_write(row); } /* SliceһLCU˵ҪϲSlice * RDO׶ΣҪ */ // if (h->param->slice_num > 1 && row_type == 2) { // nal_merge_slice(h, h->slices[h->i_slice_index]->p_bs_buf, h->i_nal_type, h->i_nal_ref_idc); // } } // for all LCU rows /* (4) Make sure that all LCU row are finished */ if (h->param->slice_num > 1) { xavs2_frame_t *p_fdec = h->fdec; for (i = 0; i < h->i_height_in_lcu; i++) { xavs2_thread_mutex_lock(&p_fdec->mutex); /* lock */ while (p_fdec->num_lcu_coded_in_row[i] < h->i_width_in_lcu) { 
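                /* re-check the predicate after every wake-up, so spurious
                 * wake-ups from the condition variable are harmless */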
xavs2_thread_cond_wait(&p_fdec->cond, &p_fdec->mutex); } xavs2_thread_mutex_unlock(&p_fdec->mutex); /* unlock */ } } /* (5) ͳSAOĿͿر */ if (h->param->enable_sao && (h->slice_sao_on[0] || h->slice_sao_on[1] || h->slice_sao_on[2])) { int sao_off_num_y = 0; int sao_off_num_u = 0; int sao_off_num_v = 0; for (i = 0; i < h->i_height_in_lcu; i++) { sao_off_num_y += h->num_sao_lcu_off[i][0]; sao_off_num_u += h->num_sao_lcu_off[i][1]; sao_off_num_v += h->num_sao_lcu_off[i][2]; } h->fdec->num_lcu_sao_off[0] = sao_off_num_y; h->fdec->num_lcu_sao_off[1] = sao_off_num_u; h->fdec->num_lcu_sao_off[2] = sao_off_num_v; } else { int num_lcu = h->i_width_in_lcu * h->i_height_in_lcu; h->fdec->num_lcu_sao_off[0] = num_lcu; h->fdec->num_lcu_sao_off[1] = num_lcu; h->fdec->num_lcu_sao_off[2] = num_lcu; } /* (6) ALF */ if (h->param->enable_alf) { xavs2_frame_copy_planes(h, h->img_alf, h->fdec); xavs2_frame_expand_border_frame(h, h->img_alf); alf_filter_one_frame(h); /* ¶عͼ߽չ */ if (h->pic_alf_on[0] || h->pic_alf_on[1] || h->pic_alf_on[2]) { xavs2_frame_expand_border_frame(h, h->fdec); } #if ENABLE_FRAME_SUBPEL_INTPL if (h->pic_alf_on[0] && h->use_fractional_me != 0) { /* interpolate (after finished expanding border) */ for (i = 0; i < h->i_height_in_lcu; i++) { interpolate_lcu_row(h, h->fdec, i); } } #endif if (h->h_top->threadpool_aec != NULL) { xavs2_threadpool_run(h->h_top->threadpool_aec, encoder_aec_encode_one_frame, h, 0); } } /* (7) after encoding ... ------------------------------------------ */ if (h->td_rdo != NULL) { tdrdo_frame_done(h); } encoder_write_rec_frame(h->h_top); /* update encoding information */ xavs2_reconfigure_encoder(h); /* release all reference frames */ for (i = 0; i < h->i_ref; i++) { release_one_frame(h, h->fref[i]); } /* make sure all row context to release */ if (h->param->i_lcurow_threads > 1) { int *num_lcu_coded = h->fdec->num_lcu_coded_in_row; for (i = 0; i < h->i_height_in_lcu; i++) { if (num_lcu_coded[i] <= h->i_width_in_lcu) { xavs2_sleep_ms(1); } } } h->b_all_row_ctx_released = 1; /* release the reconstructed frame */ release_one_frame(h, h->fdec); /* set task status */ encoder_set_task_status(h, XAVS2_TASK_RDO_DONE); if (h->h_top->threadpool_aec == NULL) { encoder_aec_encode_one_frame(h); } return 0; } /** * --------------------------------------------------------------------------- * Function : encode a video frame * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * : frm - pointer to struct xavs2_picture_t * [out] : none * Return : return 0 on success, -1 on failure * --------------------------------------------------------------------------- */ int encoder_encode(xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { if (frame->i_state != XAVS2_FLUSH) { xavs2_t *p_coder; #if XAVS2_STAT frame->i_time_start = xavs2_mdate(); #endif /* prepare the encoding context. 
* get a frame encoder handle (initialized already) */ if ((p_coder = encoder_alloc_frame_task(h_mgr, frame)) == NULL) { return -1; } init_decoding_frame(p_coder); /* encode the input frame: parallel or not */ if (h_mgr->i_frm_threads > 1) { /* frame level parallel processing enabled */ xavs2_threadpool_run(h_mgr->threadpool_rdo, xavs2e_encode_one_frame, p_coder, 0); } else { xavs2e_encode_one_frame(p_coder); } } else { /* flush output */ encoder_flush(h_mgr); /* flush stream-end */ encoder_output_frame_bitstream(h_mgr, NULL); } return 0; } /** * --------------------------------------------------------------------------- * Function : destroy the xavs2 video encoder * Parameters : * [in ] : h_mgr - pointer to struct xavs2_handler_t, the xavs2 encoder * [out] : none * Return : none * --------------------------------------------------------------------------- */ void encoder_close(xavs2_handler_t *h_mgr) { if (h_mgr == NULL) { return; } /* flush delayed frames, so that every process could be stopped */ encoder_flush(h_mgr); /* now, destroy everything! */ #if XAVS2_TRACE /* destroy trace */ xavs2_trace_destroy(); #endif /* free the task manager and all encoding contexts */ encoder_task_manager_free(h_mgr); } xavs2-1.3/source/encoder/encoder.h000066400000000000000000000065201340660520300171310ustar00rootroot00000000000000/* * encoder.h * * Description of this file: * Encoder functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_ENCODER_H #define XAVS2_ENCODER_H /** * =========================================================================== * interface function defines (pre-encode processing) * =========================================================================== */ int send_frame_to_enc_queue(xavs2_handler_t *h_mgr, xavs2_frame_t *frm); void xavs2e_get_frame_lambda(xavs2_t *h, xavs2_frame_t *cur_frm, int i_qp); /** * =========================================================================== * interface function defines (encode processing) * =========================================================================== */ int encoder_check_parameters(xavs2_param_t *param); xavs2_t *encoder_open (xavs2_param_t *param, xavs2_handler_t *h_mgr); int encoder_encode(xavs2_handler_t *h_mgr, xavs2_frame_t *frame); void encoder_close (xavs2_handler_t *h_mgr); int encoder_contexts_init(xavs2_t *h, xavs2_handler_t *h_mgr); void dump_yuv_out(xavs2_t *h, FILE *fp, xavs2_frame_t *frame, int img_w, int img_h); void encoder_fetch_one_encoded_frame(xavs2_handler_t *h_mgr, xavs2_outpacket_t *packet, int is_flush); void xavs2_reconfigure_encoder(xavs2_t *h); #if XAVS2_STAT /** * =========================================================================== * interface function defines (encode report) * =========================================================================== */ void encoder_show_head_info(xavs2_param_t *param); void encoder_show_frame_info_tab(xavs2_t *h, xavs2_handler_t *mgr); void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v); void encoder_cal_ssim(xavs2_t *h, double *ssim_y, double *ssim_u, double *ssim_v); void encoder_report_one_frame(xavs2_t *h, outputframe_t *frame); void encoder_report_stat_info(xavs2_t *h); #endif #endif // XAVS2_ENCODER_H xavs2-1.3/source/encoder/encoder_report.c000066400000000000000000000476221340660520300205270ustar00rootroot00000000000000/* * encoder_report.c * * Description of this file: * Encoder Reporting functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "mc.h" #include "wrapper.h" #include "encoder.h" #include "version.h" #include "cpu.h" #if defined(__ARM_ARCH_7A__) || SYS_LINUX || !defined(_MSC_VER) #define sprintf_s snprintf #endif #if XAVS2_STAT /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE double get_psnr_with_ssd(double f_max, uint64_t diff) { if (diff > 0) { return 10.0 * log10(f_max / diff); } else { return 99.999; } } /* --------------------------------------------------------------------------- * calculate PSNR for all three components (Y, U and V) */ void encoder_cal_psnr(xavs2_t *h, double *psnr_y, double *psnr_u, double *psnr_v) { int i_width = h->param->org_width; int i_height = h->param->org_height; int i_size = i_width * i_height; int uvformat = h->param->chroma_format == CHROMA_420 ? 4 : 2; const double f_max_signal = (double)(255 * 255) * i_size; const int inout_shift = 0; uint64_t diff_y, diff_u, diff_v; /* luma */ diff_y = xavs2_pixel_ssd_wxh(&g_funcs.pixf, h->fenc->planes[0], h->fenc->i_stride[0], h->fdec->planes[0], h->fdec->i_stride[0], i_width, i_height, inout_shift); /* chroma */ if (h->param->chroma_format != CHROMA_400) { i_width >>= 1; i_height >>= 1; diff_u = xavs2_pixel_ssd_wxh(&g_funcs.pixf, h->fenc->planes[1], h->fenc->i_stride[1], h->fdec->planes[1], h->fdec->i_stride[1], i_width, i_height, inout_shift); diff_v = xavs2_pixel_ssd_wxh(&g_funcs.pixf, h->fenc->planes[2], h->fenc->i_stride[2], h->fdec->planes[2], h->fdec->i_stride[2], i_width, i_height, inout_shift); } else { diff_u = 0; diff_v = 0; } xavs2_emms(); /* call before using float instructions */ /* get the PSNR for current frame */ *psnr_y = get_psnr_with_ssd(f_max_signal, diff_y); *psnr_u = get_psnr_with_ssd(f_max_signal, diff_u * uvformat); *psnr_v = get_psnr_with_ssd(f_max_signal, diff_v * uvformat); } /* --------------------------------------------------------------------------- * calculate SSIM */ double ssim_calculate_plane(xavs2_t *h, int comp_id) { uint32_t g_uiBitDepth = 8; // base bit-depth uint32_t g_uiBitIncrement = 0; // increments double k_ssim_1 = 0.01; double k_ssim_2 = 0.03; int m_uiWidth = h->param->org_width; int m_uiHeight = h->param->org_height; int iStride1 = h->fenc->i_stride[comp_id]; int iStride2 = h->fdec->i_stride[comp_id]; // xavs2_log(h, XAVS2_LOG_INFO, "iStride: %d",iStride); uint32_t uiWinWidth = 8; uint32_t uiWinHeight= 8; uint32_t uiWidth = comp_id ? m_uiWidth >> 1 : m_uiWidth; uint32_t uiHeight = comp_id ? 
m_uiHeight >> 1 : m_uiHeight; double dLocSSIM, dLocMeanRef, dLocMeanRec, dLocVarRef, dLocVarRec, dLocCovar, Num1, Num2, Den1, Den2, dMSSIM = 0; uint32_t i, j, x, y; // xavs2_log(h, XAVS2_LOG_INFO, "uiHeight: %d uiWinHeight: %d uiWidth: %d uiWinWidth:%d\n",uiHeight,uiWinHeight,uiWidth,uiWinWidth); uint32_t uiNumWin = (uiHeight - uiWinHeight + 1)*(uiWidth - uiWinWidth + 1); uint32_t iWinPixel = uiWinWidth * uiWinHeight; uint32_t uiMaxval = 255 * (1 << (g_uiBitDepth + g_uiBitIncrement - 8)); // xavs2_log(h, XAVS2_LOG_INFO, "uiNumWin : %d uiMaxval : %d\n",uiNumWin,uiMaxval); double C1 = k_ssim_1 * k_ssim_1 * uiMaxval * uiMaxval; double C2 = k_ssim_2 * k_ssim_2 * uiMaxval * uiMaxval; pel_t* pOrg = h->fenc->planes[comp_id]; pel_t* pRec = h->fdec->planes[comp_id]; // xavs2_log(h, XAVS2_LOG_INFO, "pOrg : %p pRec : %p\n",pOrg,pRec); pel_t* pOrgPel = pOrg; pel_t* pRecPel = pRec; for (j = 0; j <= uiHeight - uiWinHeight; j++) { for (i = 0; i <= uiWidth - uiWinWidth; i++) { dLocMeanRef = 0; dLocMeanRec = 0; dLocVarRef = 0; dLocVarRec = 0; dLocCovar = 0; pOrgPel = pOrg + i + iStride1*j; pRecPel = pRec + i + iStride2*j; // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[0] : %d pRecPel[0] : %d\n",pOrgPel[0],pRecPel[0]); // xavs2_log(h, XAVS2_LOG_INFO, "uiWinWidth : %d uiWinHeight : %d\n",uiWinWidth,uiWinHeight); for (y = 0; y < uiWinHeight; y++) { for (x = 0; x < uiWinWidth; x++) { // xavs2_log(h, XAVS2_LOG_INFO, "pOrgPel[%d] : %d pRecPel[%d] : %d\n",x,pOrgPel[x],x,pRecPel[x]); dLocMeanRef += pOrgPel[x]; dLocMeanRec += pRecPel[x]; dLocVarRef += pOrgPel[x] * pOrgPel[x]; dLocVarRec += pRecPel[x] * pRecPel[x]; dLocCovar += pOrgPel[x] * pRecPel[x]; } pOrgPel += iStride1; pRecPel += iStride2; } dLocMeanRef /= iWinPixel; dLocMeanRec /= iWinPixel; // xavs2_log(h, XAVS2_LOG_INFO, "dLocMeanRef : %7.4f dLocMeanRec : %7.4f \n",dLocMeanRef,dLocMeanRec); dLocVarRef = (dLocVarRef - dLocMeanRef * dLocMeanRef * iWinPixel) / iWinPixel; dLocVarRec = (dLocVarRec - dLocMeanRec * dLocMeanRec * iWinPixel) / iWinPixel; dLocCovar = (dLocCovar - dLocMeanRef * dLocMeanRec * iWinPixel) / iWinPixel; Num1 = 2.0 * dLocMeanRef * dLocMeanRec + C1; Num2 = 2.0 * dLocCovar + C2; Den1 = dLocMeanRef * dLocMeanRef + dLocMeanRec * dLocMeanRec + C1; Den2 = dLocVarRef + dLocVarRec + C2; dLocSSIM = (Num1 * Num2) / (Den1 * Den2); dMSSIM += dLocSSIM; } } // xavs2_log(h, XAVS2_LOG_INFO,"ssim: %7.4f \n ", dMSSIM / (double)uiNumWin); return dMSSIM / (double)uiNumWin; } /* --------------------------------------------------------------------------- * calculate SSIM for all three components (Y, U and V) */ void encoder_cal_ssim(xavs2_t *h, double *ssim_y, double *ssim_u, double *ssim_v) { *ssim_y = ssim_calculate_plane(h, 0); *ssim_u = ssim_calculate_plane(h, 1); *ssim_v = ssim_calculate_plane(h, 2); } /* --------------------------------------------------------------------------- */ void encoder_report_stat_info(xavs2_t *h) { xavs2_stat_t *p_stat = &h->h_top->stat; float f_bitrate; double f_psnr_y = p_stat->stat_total.f_psnr[0]; double f_psnr_u = p_stat->stat_total.f_psnr[1]; double f_psnr_v = p_stat->stat_total.f_psnr[2]; double ssim_y = p_stat->stat_total.f_ssim[0]; double ssim_u = p_stat->stat_total.f_ssim[1]; double ssim_v = p_stat->stat_total.f_ssim[2]; int64_t i_total_bits = p_stat->stat_total.i_frame_size; int num_total_frames = p_stat->stat_total.num_frames; if (num_total_frames == 0) { xavs2_log(NULL, XAVS2_LOG_WARNING, "------------------------------------------------------------------\n"); xavs2_log(NULL, XAVS2_LOG_WARNING, "TOTAL TIME: %8.3f sec, NO 
FRAMES CODED\n", (double)(p_stat->i_end_time - p_stat->i_start_time) / 1000000.0); return; } xavs2_log(h, XAVS2_LOG_INFO, "---------------------------------------------------------------------\n"); // FIXME: cause "Segmentation fault (core dumped)" in Linux, print directly (gcc 4.7) if (h->param->enable_psnr) { xavs2_log(h, XAVS2_LOG_INFO, "AVERAGE SEQ PSNR: %7.4f %7.4f %7.4f\n", f_psnr_y / num_total_frames, f_psnr_u / num_total_frames, f_psnr_v / num_total_frames); } if (h->param->enable_ssim) { xavs2_log(h, XAVS2_LOG_INFO, "AVERAGE SEQ SSIM: %7.5f %7.5f %7.5f\n", ssim_y / num_total_frames, ssim_u / num_total_frames, ssim_v / num_total_frames); } // BITRATE f_bitrate = (i_total_bits * (8.0f / 1000.0f) * h->framerate) / ((float)num_total_frames); xavs2_log(h, XAVS2_LOG_INFO, " BITRATE: %6.2f kb/s @ %4.1f Hz, %d frames, xavs2 p%d \n", f_bitrate, h->framerate, num_total_frames, h->param->preset_level); // TOTAL BITS xavs2_log(h, XAVS2_LOG_INFO, " TOTAL BITS: %lld (I: %lld, B: %lld, P/F: %lld)\n", i_total_bits * 8, p_stat->stat_i_frame.i_frame_size * 8, p_stat->stat_b_frame.i_frame_size * 8, p_stat->stat_p_frame.i_frame_size * 8); // TOTAL TIME xavs2_log(h, XAVS2_LOG_DEBUG, " TOTAL TIME: %8.3f sec, total %d frames, speed: %5.2f fps \n", (double)(p_stat->i_end_time - p_stat->i_start_time) / 1000000.0, num_total_frames, (double)num_total_frames / ((p_stat->i_end_time - p_stat->i_start_time) / 1000000.0)); // Time Distribution xavs2_log(h, XAVS2_LOG_DEBUG, " Frame Time: I: %6.2f%%; B: %6.2f%%; P/F: %6.2f%%\n", (double)(p_stat->stat_i_frame.i_time_duration * 100.0) / p_stat->stat_total.i_time_duration, (double)(p_stat->stat_b_frame.i_time_duration * 100.0) / p_stat->stat_total.i_time_duration, (double)(p_stat->stat_p_frame.i_time_duration * 100.0) / p_stat->stat_total.i_time_duration); xavs2_log(h, XAVS2_LOG_INFO, " Frame Num : I: %6.2f%%; B: %6.2f%%; P/F: %6.2f%%\n", p_stat->stat_i_frame.num_frames * 100.0 / num_total_frames, p_stat->stat_b_frame.num_frames * 100.0 / num_total_frames, p_stat->stat_p_frame.num_frames * 100.0 / num_total_frames); xavs2_log(h, XAVS2_LOG_INFO, "---------------------------------------------------------------------\n"); } /* --------------------------------------------------------------------------- */ static INLINE void stat_add_frame_info(com_stat_t *sum_stat, com_stat_t *frm_stat, int frm_bs_len) { sum_stat->num_frames++; sum_stat->i_frame_size += frm_bs_len; sum_stat->i_time_duration += frm_stat->i_time_duration; sum_stat->f_psnr[0] += frm_stat->f_psnr[0]; sum_stat->f_psnr[1] += frm_stat->f_psnr[1]; sum_stat->f_psnr[2] += frm_stat->f_psnr[2]; sum_stat->f_ssim[0] += frm_stat->f_ssim[0]; sum_stat->f_ssim[1] += frm_stat->f_ssim[1]; sum_stat->f_ssim[2] += frm_stat->f_ssim[2]; } /* --------------------------------------------------------------------------- * get reference list string */ static ALWAYS_INLINE void get_reference_list_str(char *s_ref_list, int *p_poc, int num_ref) { if (num_ref > 1) { char str_tmp[16]; int i; sprintf(s_ref_list, "[%3d ", p_poc[0]); for (i = 1; i < num_ref - 1; i++) { sprintf(str_tmp, "%3d ", p_poc[i]); strcat(s_ref_list, str_tmp); } sprintf(str_tmp, "%3d]", p_poc[i]); strcat(s_ref_list, str_tmp); } else if (num_ref == 1) { sprintf(s_ref_list, "[%3d]", p_poc[0]); } } /* --------------------------------------------------------------------------- */ void encoder_report_one_frame(xavs2_t *h, outputframe_t *frame) { static const char frm_type[4] = {'I', 'P', 'B', 'F'}; char s_out_base[256]; char s_psnr[64] = ""; char s_ref_list[32] = ""; 
xavs2_stat_t *p_stat = &h->h_top->stat; frame_stat_t *frmstat = &frame->out_frm_stat; int frm_bs_len = frame->frm_enc->i_bs_len; if (p_stat->i_start_time == 0) { p_stat->i_start_time = frame->frm_enc->i_time_start; } p_stat->i_end_time = frame->frm_enc->i_time_end; frmstat->stat_frm.i_time_duration = frame->frm_enc->i_time_end - frame->frm_enc->i_time_start; /* frame info */ xavs2_thread_mutex_lock(&h->h_top->mutex); switch (frmstat->i_type) { case 0: stat_add_frame_info(&p_stat->stat_i_frame, &frmstat->stat_frm, frm_bs_len); break; case 2: stat_add_frame_info(&p_stat->stat_b_frame, &frmstat->stat_frm, frm_bs_len); break; default: stat_add_frame_info(&p_stat->stat_p_frame, &frmstat->stat_frm, frm_bs_len); break; } stat_add_frame_info(&p_stat->stat_total, &frmstat->stat_frm, frm_bs_len); xavs2_thread_mutex_unlock(&h->h_top->mutex); if (h->param->enable_psnr && h->param->enable_ssim) { sprintf_s(s_psnr, 64, " %7.4f %7.4f %7.4f %7.5f %7.5f %7.5f", frmstat->stat_frm.f_psnr[0], frmstat->stat_frm.f_psnr[1], frmstat->stat_frm.f_psnr[2], frmstat->stat_frm.f_ssim[0], frmstat->stat_frm.f_ssim[1], frmstat->stat_frm.f_ssim[2]); } else if (h->param->enable_psnr) { sprintf_s(s_psnr, 64, " %7.4f %7.4f %7.4f", frmstat->stat_frm.f_psnr[0], frmstat->stat_frm.f_psnr[1], frmstat->stat_frm.f_psnr[2]); } sprintf_s(s_out_base, 256, "%4d (%c) %2d %8d %s %5d", frmstat->i_frame, frm_type[frmstat->i_type], frmstat->i_qp, // frmstat->stat_frm.f_lambda_frm, // %7.2f frame->frm_enc->i_bs_len * 8, s_psnr, (int)((frmstat->stat_frm.i_time_duration) / 1000)); get_reference_list_str(s_ref_list, frmstat->ref_poc_set, frmstat->i_ref); xavs2_log(h, XAVS2_LOG_DEBUG, "%s %s\n", s_out_base, s_ref_list); } /* --------------------------------------------------------------------------- */ void encoder_show_head_info(xavs2_param_t *param) { const char *s_gop_param = param->b_open_gop ? 
"Open" : "Closed"; char buf_cpu[120] = ""; char s_threads_row [16] = "auto"; char s_threads_frame[16] = "auto"; /* init temp string */ if (param->i_lcurow_threads != 0) { sprintf(s_threads_row, "%d", param->i_lcurow_threads); } if (param->i_frame_threads != 0) { sprintf(s_threads_frame, "%d", param->i_frame_threads); } /* algorithms and controls in the encoder */ if (param->enable_refine_qp) { xavs2_log(NULL, XAVS2_LOG_DEBUG, " RefinedQp is on, the input QP might be changed;\n"); } /* input/output properties */ xavs2_log(NULL, XAVS2_LOG_DEBUG, " xavs2enc version : %s %s\n", XVERSION_STR, XBUILD_TIME); xavs2_log(NULL, XAVS2_LOG_DEBUG, " Input YUV file : %s \n", param->psz_in_file); xavs2_log(NULL, XAVS2_LOG_DEBUG, " Output bitstream : %s \n", param->psz_bs_file); xavs2_log(NULL, XAVS2_LOG_DEBUG, " Recon YUV file : %s \n", param->psz_dump_yuv); xavs2_log(NULL, XAVS2_LOG_DEBUG, " Total Frames : %d \n", param->num_frames); /* basic parameters */ xavs2_log(NULL, XAVS2_LOG_INFO, "--------------------------------------------------------------------------------\n"); xavs2_log(NULL, XAVS2_LOG_INFO, " Profile & Level : 0x%02X-0x%02X, BitDepth: %d/%d, size(pel): %d \n", param->profile_id, param->level_id, param->input_sample_bit_depth, param->sample_bit_depth, sizeof(pel_t)); xavs2_log(NULL, XAVS2_LOG_INFO, " Video Property : %dx%d, %.3f Hz (FrameRateCode: %d)\n", param->org_width, param->org_height, param->frame_rate, param->frame_rate_code); /* CPU capacities */ xavs2_get_simd_capabilities(buf_cpu, g_funcs.cpuid); xavs2_log(NULL, XAVS2_LOG_INFO, " CPU Capabilities : %s\n", buf_cpu); xavs2_log(NULL, XAVS2_LOG_INFO, " Preset Level : %d, %s \n", param->preset_level, xavs2_preset_names[param->preset_level]); xavs2_log(NULL, XAVS2_LOG_INFO, " Ref Structure : BFrames: %d; %s GOP; IntraPeriod: %d~%d\n", param->num_bframes, s_gop_param, param->intra_period_min, param->intra_period_max); xavs2_log(NULL, XAVS2_LOG_INFO, " Rate Control : %d; QP: %d, [%2d, %2d]; %.3f Mbps\n", param->i_rc_method, param->i_initial_qp, param->i_min_qp, param->i_max_qp, 0.000001f * param->i_target_bitrate); xavs2_log(NULL, XAVS2_LOG_INFO, " Threads (Row/Frm): %s / %s, cpu cores %d \n", s_threads_row, s_threads_frame, xavs2_cpu_num_processors()); } /* --------------------------------------------------------------------------- */ void encoder_show_frame_info_tab(xavs2_t *h, xavs2_handler_t *mgr) { const xavs2_param_t *param = h->param; size_t space_alloc = xavs2_get_total_malloc_space(); space_alloc = (space_alloc + (1 << 20) - 1) >> 20; xavs2_log(NULL, XAVS2_LOG_INFO, " Threads (Alloc) : %d / %d, threadpool %d, RowContexts %d \n", mgr->i_row_threads, mgr->i_frm_threads, mgr->num_pool_threads, mgr->num_row_contexts); xavs2_log(NULL, XAVS2_LOG_INFO, " Memory (Alloc) : %d MB \n", (int)(space_alloc)); xavs2_log(NULL, XAVS2_LOG_INFO, " Enabled Tools : LCU %d, 2NxN/Nx2N:%d, AMP:%d, IntraInInter:%d, SDIP:%d,\n"\ " FFrame %d, DHP:%d, DMH:%d, MHP:%d, WSM:%d,\n"\ " NSQT:%d, Fast2LevelTu:%d, 2ndTrans:%d,\n"\ " ME:%d, SearchRange:%d,\n"\ " RefinedQP:%d, TDRDO:%d, Algorithm: %8llx\n"\ " RdLevel:%d, RdoqLevel:%d, SAO:%d, ALF:%d.\n", 1 << param->lcu_bit_level, param->inter_2pu, param->enable_amp, param->enable_intra, param->enable_sdip, param->enable_f_frame, param->enable_dhp, param->enable_dmh, param->enable_mhp_skip, param->enable_wsm, param->enable_nsqt, param->b_fast_2lelvel_tu, param->enable_secT, param->me_method, param->search_range, param->enable_refine_qp, param->enable_tdrdo, h->i_fast_algs, param->i_rd_level, param->i_rdoq_level, 
param->enable_sao, param->enable_alf); /* table header */ xavs2_log(NULL, XAVS2_LOG_INFO, "--------------------------------------------------------------------------------\n"); if (param->enable_psnr && param->enable_ssim){ xavs2_log(NULL, XAVS2_LOG_DEBUG, "POC Type QP + Bits PsnrY PsnrU PsnrV SsimY SsimU SsimV Time [ RefList ]\n"); } else if (param->enable_psnr) { xavs2_log(NULL, XAVS2_LOG_DEBUG, "POC Type QP + Bits PsnrY PsnrU PsnrV Time [ RefList ]\n"); } else { xavs2_log(NULL, XAVS2_LOG_DEBUG, "POC Type QP + Bits Time [ RefList ]\n"); } } #endif // #if XAVS2_STAT xavs2-1.3/source/encoder/header.c000066400000000000000000000604071340660520300167410ustar00rootroot00000000000000/* * header.c * * Description of this file: * Header writing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "header.h" #include "wquant.h" #include "bitstream.h" #include "aec.h" /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int is_valid_qp(xavs2_t *h, int i_qp) { int max_qp = MAX_QP; UNUSED_PARAMETER(h); return i_qp >= 0 && i_qp <= max_qp; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int get_frame_coi_to_write(xavs2_t *h, xavs2_frame_t *frm) { if (h->param->num_parallel_gop > 1) { return (frm->i_frm_coi - frm->i_gop_idr_coi) & 255; } else { return frm->i_frm_coi & 255; } } /* --------------------------------------------------------------------------- * write sequence header information */ int xavs2_sequence_write(xavs2_t *h, bs_t *p_bs) { const xavs2_rps_t *p_seq_rps = h->param->cfg_ref_all; int bits = 0; int i, j; bits += u_0(p_bs, 32, 0x1b0, "seqence_start_code"); bits += u_v(p_bs, 8, h->param->profile_id, "profile_id"); bits += u_v(p_bs, 8, h->param->level_id, "level_id"); bits += u_v(p_bs, 1, h->param->progressive_sequence, "progressive_sequence"); bits += u_v(p_bs, 1, h->b_field_sequence, "field_coded_sequence"); bits += u_v(p_bs, 14, h->param->org_width, "horizontal_size"); bits += u_v(p_bs, 14, h->param->org_height, "vertical_size"); bits += u_v(p_bs, 2, h->param->chroma_format, "chroma_format"); bits += u_v(p_bs, 3, h->param->sample_precision, "sample_precision"); if (h->param->profile_id == MAIN10_PROFILE) { // MAIN10 profile bits += u_v(p_bs, 3, ((h->param->sample_bit_depth - 6) / 2), "encoding_precision"); } bits += u_v(p_bs, 4, h->param->aspect_ratio_information, "aspect_ratio_information"); bits += u_v(p_bs, 4, h->param->frame_rate_code, "frame_rate_code"); bits += u_v(p_bs, 18, h->param->bitrate_lower, "bit_rate_lower"); bits += u_v(p_bs, 1, 1, "marker bit"); bits += u_v(p_bs, 12, h->param->bitrate_upper, "bit_rate_upper"); bits += u_v(p_bs, 1, h->param->low_delay, "low_delay"); bits += u_v(p_bs, 1, 1, "marker bit"); bits += u_v(p_bs, 1, h->param->temporal_id_exist_flag, "temporal_id exist flag"); bits += u_v(p_bs, 18, h->param->bbv_buffer_size, "bbv buffer size"); bits += u_v(p_bs, 3, h->i_lcu_level, "Largest Coding Block Size"); bits += u_v(p_bs, 1, h->param->enable_wquant, "weight_quant_enable"); #if ENABLE_WQUANT if (h->param->enable_wquant) { bits += u_v(p_bs, 1, h->param->SeqWQM, "load_seq_weight_quant_data_flag"); if (h->param->SeqWQM) { int x, y, sizeId, iWqMSize; wq_data_t *wq = &h->wq_data; for (sizeId = 0; sizeId < 2; sizeId++) { iWqMSize = XAVS2_MIN(1 << (sizeId + 2), 8); for (y = 0; y < iWqMSize; y++) { for (x = 0; x < iWqMSize; x++) { bits += ue_v(p_bs, wq->seq_wq_matrix[sizeId][y * iWqMSize + x], "weight_quant_coeff"); } } } } } #else assert(h->param->enable_wquant == 0); #endif bits += u_v(p_bs, 1, 1, "background_picture_disable"); bits += u_v(p_bs, 1, h->param->enable_mhp_skip, "mhpskip enabled"); bits += u_v(p_bs, 1, h->param->enable_dhp, "dhp enabled"); bits += u_v(p_bs, 1, h->param->enable_wsm, "wsm enabled"); bits += u_v(p_bs, 1, h->param->enable_amp, "Asymmetric Motion Partitions"); bits += u_v(p_bs, 1, h->param->enable_nsqt, "enable_NSQT"); bits += u_v(p_bs, 1, h->param->enable_sdip, "enable_SDIP"); bits += u_v(p_bs, 1, h->param->enable_secT, "secT enabled"); bits += u_v(p_bs, 1, h->param->enable_sao, "SAO Enable Flag"); bits += 
u_v(p_bs, 1, h->param->enable_alf, "ALF Enable Flag"); bits += u_v(p_bs, 1, h->param->enable_pmvr, "pmvr enabled"); bits += u_v(p_bs, 1, 1, "marker bit"); bits += u_v(p_bs, 6, h->i_gop_size, "num_of_RPS"); for (i = 0; i < h->i_gop_size; i++) { bits += u_v(p_bs, 1, p_seq_rps[i].referd_by_others, "refered by others"); bits += u_v(p_bs, 3, p_seq_rps[i].num_of_ref, "num of reference picture"); for (j = 0; j < p_seq_rps[i].num_of_ref; j++) { bits += u_v(p_bs, 6, p_seq_rps[i].ref_pic[j], "delta COI of ref pic"); } bits += u_v(p_bs, 3, p_seq_rps[i].num_to_rm, "num of removed picture"); for (j = 0; j < p_seq_rps[i].num_to_rm; j++) { bits += u_v(p_bs, 6, p_seq_rps[i].rm_pic[j], "delta COI of removed pic"); } bits += u_v(p_bs, 1, 1, "marker bit"); } if (!h->param->low_delay) { bits += u_v(p_bs, 5, h->picture_reorder_delay, "output_reorder_delay"); } bits += u_v(p_bs, 1, h->param->b_cross_slice_loop_filter, "Cross Loop Filter Flag"); bits += u_v(p_bs, 3, 0, "reserved bits"); /* byte align */ bits += bs_byte_align(p_bs); //xavs2_log(h, XAVS2_LOG_INFO, "Sequence Header, inserted before frame %d, COI %d\n", h->fenc->i_frame, h->curr_coi); return bits; } /* --------------------------------------------------------------------------- * write user data */ int xavs2_user_data_write(bs_t *p_bs) { const char *avs2_log = "xavs2 encoder"; int bits; bits = u_0(p_bs, 32, 0x1b2, "user data start code"); while (*avs2_log) { bits += u_v(p_bs, 8, *avs2_log++, "user data"); } bits += bs_byte_align(p_bs); return bits; } /* --------------------------------------------------------------------------- */ int xavs2_intra_picture_header_write(xavs2_t *h, bs_t *p_bs) { int bbv_delay = 0xFFFF; int display_delay = 0; int len; int i; len = u_0(p_bs, 32, 0x1B3, "I picture start code"); len += u_v(p_bs, 32, bbv_delay, "bbv_delay"); len += u_v(p_bs, 1, h->param->time_code_flag, "time_code_flag"); // if (h->param->time_code_flag) { // tc = frametotc(h, frame, h->tc_reserve_bit); // len += u_v(p_bs, 24, tc, "time_code"); // } if (!h->param->low_delay) { display_delay = h->fdec->i_frame - h->fdec->i_frm_coi + h->picture_reorder_delay; } len += u_v(p_bs, 8, get_frame_coi_to_write(h, h->fdec), "coding_order"); if (h->param->temporal_id_exist_flag == 1) { len += u_v(p_bs, TEMPORAL_MAXLEVEL_BIT, h->i_layer, "temporal_id"); } if (!h->param->low_delay) { len += ue_v(p_bs, display_delay, "picture_output_delay"); } len += u_v(p_bs, 1, h->fdec->rps.idx_in_gop >= 0, "use RCS in SPS"); if (h->fdec->rps.idx_in_gop >= 0) { len += u_v(p_bs, 5, h->fdec->rps.idx_in_gop, "predict for RCS"); } else { len += u_v(p_bs, 1, h->fdec->rps.referd_by_others, "referenced by others"); len += u_v(p_bs, 3, h->fdec->rps.num_of_ref, "num of reference picture"); for (i = 0; i < h->fdec->rps.num_of_ref; i++) { len += u_v(p_bs, 6, h->fdec->rps.ref_pic[i], "delta COI of ref pic"); } len += u_v(p_bs, 3, h->fdec->rps.num_to_rm, "num of removed picture"); for (i = 0; i < h->fdec->rps.num_to_rm; i++) { len += u_v(p_bs, 6, h->fdec->rps.rm_pic[i], "delta COI of removed pic"); } len += u_v(p_bs, 1, 1, "marker bit"); } if (h->param->low_delay) { len += ue_v(p_bs, 0, "bbv check times"); } len += u_v(p_bs, 1, h->b_progressive, "progressive_frame"); if (!h->b_progressive) { len += u_v(p_bs, 1, 1, "picture_structure"); } len += u_v(p_bs, 1, h->param->top_field_first, "top_field_first"); len += u_v(p_bs, 1, h->param->repeat_first_field, "repeat_first_field"); if (h->param->InterlaceCodingOption == FIELD_CODING) { len += u_v(p_bs, 1, h->b_top_field, "is top field"); len += u_v(p_bs, 
1, 1, "reserved bit for interlace coding"); } len += u_v(p_bs, 1, h->param->fixed_picture_qp, "fixed_picture_qp"); len += u_v(p_bs, 7, h->i_qp, "picture_qp"); len += u_v(p_bs, 1, h->param->loop_filter_disable, "loop_filter_disable"); if (!h->param->loop_filter_disable) { len += u_v(p_bs, 1, h->param->loop_filter_parameter_flag, "loop_filter_parameter_flag"); if (h->param->loop_filter_parameter_flag) { len += se_v(p_bs, h->param->alpha_c_offset, "alpha offset"); len += se_v(p_bs, h->param->beta_offset, "beta offset"); } } #if ENABLE_WQUANT len += u_v(p_bs, 1, h->param->chroma_quant_param_disable, "chroma_quant_param_disable"); if (!h->param->chroma_quant_param_disable) { len += se_v(p_bs, h->param->chroma_quant_param_delta_u, "chroma_quant_param_delta_cb"); len += se_v(p_bs, h->param->chroma_quant_param_delta_v, "chroma_quant_param_delta_cr"); } else { assert(h->param->chroma_quant_param_delta_u == 0); assert(h->param->chroma_quant_param_delta_v == 0); } #else len += u_v(p_bs, 1, 1, "chroma_quant_param_disable"); #endif // ENABLE_WQUANT if (!is_valid_qp(h, h->i_qp)) { xavs2_log(h,XAVS2_LOG_ERROR,"Invalid I Picture QP: %d\n",h->i_qp); } #if ENABLE_WQUANT // adaptive frequency weighting quantization if (h->param->enable_wquant) { len += u_v(p_bs, 1, h->param->PicWQEnable, "pic_weight_quant_enable"); if (h->param->PicWQEnable) { len += u_v(p_bs, 2, h->param->PicWQDataIndex, "pic_weight_quant_data_index"); if (h->param->PicWQDataIndex == 1) { len += u_v(p_bs, 1, 0, "reserved_bits"); len += u_v(p_bs, 2, h->param->WQParam, "weighting_quant_param_index"); len += u_v(p_bs, 2, h->param->WQModel, "weighting_quant_model"); if ((h->param->WQParam == 1) || ((h->param->MBAdaptQuant) && (h->param->WQParam == 3))) { for (i = 0; i < 6; i++) { len += se_v(p_bs, (int)(h->wq_data.wq_param[UNDETAILED][i] - tab_wq_param_default[UNDETAILED][i]), "quant_param_delta_u"); } } if ((h->param->WQParam == 2) || ((h->param->MBAdaptQuant) && (h->param->WQParam == 3))) { for (i = 0; i < 6; i++) { len += se_v(p_bs, (int)(h->wq_data.wq_param[DETAILED][i] - tab_wq_param_default[DETAILED][i]), "quant_param_delta_d"); } } } else if (h->param->PicWQDataIndex == 2) { int x, y, sizeId, iWqMSize; for (sizeId = 0; sizeId < 2; sizeId++) { i = 0; iWqMSize = XAVS2_MIN(1 << (sizeId + 2), 8); for (y = 0; y < iWqMSize; y++) { for (x = 0; x < iWqMSize; x++) { len += ue_v(p_bs, h->wq_data.pic_user_wq_matrix[sizeId][i++], "weight_quant_coeff"); } } } } } } #endif return len; } /* --------------------------------------------------------------------------- */ int xavs2_inter_picture_header_write(xavs2_t *h, bs_t *p_bs) { int bbv_delay = 0xFFFF; int picture_coding_type; int display_delay = 0; int len; int i; if (h->i_type == SLICE_TYPE_P) { picture_coding_type = 1; } else if (h->i_type == SLICE_TYPE_F) { picture_coding_type = 3; } else { picture_coding_type = 2; } len = u_0(p_bs, 24, 1, "start_code_prefix"); len += u_0(p_bs, 8, 0xB6, "picture start code"); len += u_v(p_bs, 32, bbv_delay, "bbv delay"); len += u_v(p_bs, 2, picture_coding_type, "picture_coding_type"); if (!h->param->low_delay) { display_delay = h->fenc->i_frame - h->fdec->i_frm_coi + h->picture_reorder_delay; } len += u_v(p_bs, 8, get_frame_coi_to_write(h, h->fdec), "coding_order"); if (h->param->temporal_id_exist_flag == 1) { len += u_v(p_bs, TEMPORAL_MAXLEVEL_BIT, h->i_layer, "temporal_id"); } if (!h->param->low_delay) { len += ue_v(p_bs, display_delay, "displaydelay"); } len += u_v(p_bs, 1, h->fdec->rps.idx_in_gop >= 0, "use RPS in SPS"); if (h->fdec->rps.idx_in_gop >= 0) { len 
+= u_v(p_bs, 5, h->fdec->rps.idx_in_gop, "predict for RPS"); } else { len += u_v(p_bs, 1, h->fdec->rps.referd_by_others, "refered by others"); len += u_v(p_bs, 3, h->fdec->rps.num_of_ref, "num of reference picture"); for (i = 0; i < h->fdec->rps.num_of_ref; i++) { len += u_v(p_bs, 6, h->fdec->rps.ref_pic[i], "delta COI of ref pic"); } len += u_v(p_bs, 3, h->fdec->rps.num_to_rm, "num of removed picture"); for (i = 0; i < h->fdec->rps.num_to_rm; i++) { len += u_v(p_bs, 6, h->fdec->rps.rm_pic[i], "delta COI of removed pic"); } len += u_v(p_bs, 1, 1, "marker bit"); } if (h->param->low_delay) { len += ue_v(p_bs, 0, "bbv check times"); } len += u_v(p_bs, 1, h->b_progressive, "progressive_frame"); if (!h->b_progressive) { len += u_v(p_bs, 1, 1, "picture_structure"); } len += u_v(p_bs, 1, h->param->top_field_first, "top_field_first"); len += u_v(p_bs, 1, h->param->repeat_first_field, "repeat_first_field"); if (h->param->InterlaceCodingOption == FIELD_CODING) { len += u_v(p_bs, 1, h->b_top_field, "is top field"); len += u_v(p_bs, 1, 1, "reserved bit for interlace coding"); } len += u_v(p_bs, 1, h->param->fixed_picture_qp, "fixed_picture_qp"); len += u_v(p_bs, 7, h->i_qp, "picture_qp"); if (picture_coding_type != 2) { len += u_v(p_bs, 1, 0, "reserved_bit"); } len += u_v(p_bs, 1, h->fenc->b_random_access_decodable, "random_access_decodable_flag"); len += u_v(p_bs, 1, h->param->loop_filter_disable, "loop_filter_disable"); if (!h->param->loop_filter_disable) { len += u_v(p_bs, 1, h->param->loop_filter_parameter_flag, "loop_filter_parameter_flag"); if (h->param->loop_filter_parameter_flag) { len += se_v(p_bs, h->param->alpha_c_offset, "alpha offset"); len += se_v(p_bs, h->param->beta_offset, "beta offset"); } } #if ENABLE_WQUANT len += u_v(p_bs, 1, h->param->chroma_quant_param_disable, "chroma_quant_param_disable"); if (!h->param->chroma_quant_param_disable) { len += se_v(p_bs, h->param->chroma_quant_param_delta_u, "chroma_quant_param_delta_cb"); len += se_v(p_bs, h->param->chroma_quant_param_delta_v, "chroma_quant_param_delta_cr"); } else { assert(h->param->chroma_quant_param_delta_u == 0); assert(h->param->chroma_quant_param_delta_v == 0); } #else len += u_v(p_bs, 1, 1, "chroma_quant_param_disable"); #endif // ENABLE_WQUANT if (!is_valid_qp(h, h->i_qp)) { xavs2_log(h, XAVS2_LOG_ERROR, "Invalid PB Picture QP: %d\n", h->i_qp); } // adaptive frequency weighting quantization #if ENABLE_WQUANT if (h->param->enable_wquant) { len += u_v(p_bs, 1, h->param->PicWQEnable, "pic_weight_quant_enable"); if (h->param->PicWQEnable) { len += u_v(p_bs, 2, h->param->PicWQDataIndex, "pic_weight_quant_data_index"); if (h->param->PicWQDataIndex == 1) { len += u_v(p_bs, 1, 0, "reserved_bits"); len += u_v(p_bs, 2, h->param->WQParam, "weighting_quant_param_index"); len += u_v(p_bs, 2, h->param->WQModel, "weighting_quant_model"); if ((h->param->WQParam == 1) || ((h->param->MBAdaptQuant) && (h->param->WQParam == 3))) { for (i = 0; i < 6; i++) { len += se_v(p_bs, (int)(h->wq_data.wq_param[UNDETAILED][i] - tab_wq_param_default[UNDETAILED][i]), "quant_param_delta_u"); } } if ((h->param->WQParam == 2) || ((h->param->MBAdaptQuant) && (h->param->WQParam == 3))) { for (i = 0; i < 6; i++) { len += se_v(p_bs, (int)(h->wq_data.wq_param[DETAILED][i] - tab_wq_param_default[DETAILED][i]), "quant_param_delta_d"); } } } else if (h->param->PicWQDataIndex == 2) { int x, y, sizeId, iWqMSize; for (sizeId = 0; sizeId < 2; sizeId++) { i = 0; iWqMSize = XAVS2_MIN(1 << (sizeId + 2), 8); for (y = 0; y < iWqMSize; y++) { for (x = 0; x < iWqMSize; x++) { 
len += ue_v(p_bs, h->wq_data.pic_user_wq_matrix[sizeId][i++], "weight_quant_coeff"); } } } } } } #endif return len; } static void writeAlfCoeff(ALFParam *Alfp, bs_t *p_bs, int componentID) { int groupIdx[NO_VAR_BINS]; int pos, i; int f = 0; switch (componentID) { case IMG_U: case IMG_V: for (pos = 0; pos < ALF_MAX_NUM_COEF; pos++) { se_v(p_bs, Alfp->coeffmulti[0][pos], "Chroma ALF coefficients"); } break; case IMG_Y: ue_v(p_bs, Alfp->filters_per_group - 1, "ALF filter number"); groupIdx[0] = 0; f++; if (Alfp->filters_per_group > 1) { for (i = 1; i < NO_VAR_BINS; i++) { if (Alfp->filterPattern[i] == 1) { groupIdx[f] = i; f++; } } } for (f = 0; f < Alfp->filters_per_group; f++) { if (f > 0 && Alfp->filters_per_group != 16) { ue_v(p_bs, (uint32_t)(groupIdx[f] - groupIdx[f - 1]), "Region distance"); } for (pos = 0; pos < ALF_MAX_NUM_COEF; pos++) { se_v(p_bs, Alfp->coeffmulti[f][pos], "Luma ALF coefficients"); } } break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a legal component ID.\n"); assert(0); exit(-1); } } /* --------------------------------------------------------------------------- */ void xavs2_picture_header_alf_write(xavs2_t *h, ALFParam *alfPictureParam, bs_t *p_bs) { if (h->param->enable_alf) { u_v(p_bs, 1, h->pic_alf_on[0], "alf_pic_flag_Y"); u_v(p_bs, 1, h->pic_alf_on[1], "alf_pic_flag_Cb"); u_v(p_bs, 1, h->pic_alf_on[2], "alf_pic_flag_Cr"); if (h->pic_alf_on[0]) { writeAlfCoeff(alfPictureParam + 0, p_bs, 0); } if (h->pic_alf_on[1]) { writeAlfCoeff(alfPictureParam + 1, p_bs, 1); } if (h->pic_alf_on[2]) { writeAlfCoeff(alfPictureParam + 2, p_bs, 2); } } } /* --------------------------------------------------------------------------- * slice header write, only full-row slices supported */ int xavs2_slice_header_write(xavs2_t *h, slice_t *p_slice) { int len; bs_t *p_bs = &p_slice->bs; len = u_0(p_bs, 24, 1, "start code prefix"); len += u_v(p_bs, 8, p_slice->i_first_lcu_y, "slice vertical position"); if (h->i_height > (144 * (1 << h->i_lcu_level))) { int slice_vertical_position_extension = 0; /* TODO: ? */ len += u_v(p_bs, 3, slice_vertical_position_extension, "slice vertical position extension"); } len += u_v(p_bs, 8, 0, "slice horizontal position"); if (h->i_width > (255 * (1 << h->i_lcu_level))) { int slice_horizontal_position_extension = 0; /* TODO: ? */ len += u_v(p_bs, 2, slice_horizontal_position_extension, "slice horizontal position extension"); } if (!h->param->fixed_picture_qp) { len += u_v(p_bs, 1, h->param->i_rc_method != XAVS2_RC_CBR_SCU, "fixed_slice_qp"); len += u_v(p_bs, 7, p_slice->i_qp, "slice_qp"); } if (h->param->enable_sao) { len += u_v(p_bs, 1, h->slice_sao_on[0], "sao_slice_flag_Y"); len += u_v(p_bs, 1, h->slice_sao_on[1], "sao_slice_flag_Cb"); len += u_v(p_bs, 1, h->slice_sao_on[2], "sao_slice_flag_Cr"); } if (!is_valid_qp(h, h->i_qp)){ xavs2_log(h, XAVS2_LOG_ERROR, "Invalid Slice QP: %d\n", h->i_qp); } return len; } xavs2-1.3/source/encoder/header.h000066400000000000000000000044051340660520300167420ustar00rootroot00000000000000/* * header.h * * Description of this file: * Header writing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_HEADER_H #define XAVS2_HEADER_H #define xavs2_sequence_write FPFX(sequence_write) int xavs2_sequence_write(xavs2_t *h, bs_t *p_bs); #define xavs2_user_data_write FPFX(user_data_write) int xavs2_user_data_write(bs_t *p_bs); #define xavs2_intra_picture_header_write FPFX(intra_picture_header_write) int xavs2_intra_picture_header_write(xavs2_t *h, bs_t *p_bs); #define xavs2_inter_picture_header_write FPFX(inter_picture_header_write) int xavs2_inter_picture_header_write(xavs2_t *h, bs_t *p_bs); #define xavs2_picture_header_alf_write FPFX(picture_header_alf_write) void xavs2_picture_header_alf_write(xavs2_t *h, ALFParam *alfPictureParam, bs_t *p_bs); #define xavs2_slice_header_write FPFX(slice_header_write) int xavs2_slice_header_write(xavs2_t *h, slice_t *p_slice); #endif // XAVS2_HEADER_H xavs2-1.3/source/encoder/md_inter.c000066400000000000000000001403471340660520300173140ustar00rootroot00000000000000/* * md_inter.c * * Description of this file: * Mode decision functions definition for Inter prediction of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "predict.h" #include "block_info.h" #include "cudata.h" #include "me.h" /** * =========================================================================== * global variables * =========================================================================== */ static const double tab_umh_alpha_2nd[MAX_INTER_MODES] = { 0.0f, 0.01f, 0.01f, 0.01f, 0.02f, 0.03f, 0.03f, 0.04f }; static const double tab_umh_alpha_3rd[MAX_INTER_MODES] = { 0.0f, 0.06f, 0.07f, 0.07f, 0.08f, 0.12f, 0.11f, 0.15f }; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * determine the MVD value (1/4 pixel) is legal or not * Return: 0: out of the legal mv range; 1: in the legal mv range */ static ALWAYS_INLINE int check_mvd(xavs2_t *h, int mvd_x, int mvd_y) { if (h->param->i_frame_threads > 1) { return (mvd_x < 4096 && mvd_x >= -4096 && mvd_y < ((1 << h->i_lcu_level) << 2) && mvd_y >= -((1 << h->i_lcu_level) << 2)); } return (mvd_x < 4096 && mvd_x >= -4096 && mvd_y < 4096 && mvd_y >= -4096); } /* --------------------------------------------------------------------------- * determine the forward and backward mv value (1/4 pixel) is legal or not * return: 0: out of the legal mv range; 1: in the legal mv range */ static int check_mv_range_sym(xavs2_t *h, mv_t *mv, int pix_x, int pix_y, int bsx, int bsy, int distance_fwd, int distance_bwd) { int bsize = 1 << h->i_lcu_level; /* valid padding size */ int min_x = -((pix_x + bsize) << 2); int min_y = -((pix_y + bsize) << 2); int max_x = ((h->i_width - (pix_x + bsx)) + bsize) << 2; int max_y = ((h->i_height - (pix_y + bsy)) + bsize) << 2; int bwd_mvx, bwd_mvy; min_x = XAVS2_MAX(min_x, h->min_mv_range[0]); min_y = XAVS2_MAX(min_y, h->min_mv_range[1]); max_x = XAVS2_MIN(max_x, h->max_mv_range[0]); max_y = XAVS2_MIN(max_y, h->max_mv_range[1]); if (h->i_type == SLICE_TYPE_B) { bwd_mvx = -scale_mv_skip( mv->x, distance_bwd, distance_fwd); bwd_mvy = -scale_mv_skip_y(h, mv->y, distance_bwd, distance_fwd); } else { /* SLICE_TYPE_F or SLICE_TYPE_P */ bwd_mvx = scale_mv_skip( mv->x, distance_bwd, distance_fwd); bwd_mvy = scale_mv_skip_y(h, mv->y, distance_bwd, distance_fwd); } if (mv->x > max_x || mv->x < min_x || mv->y > max_y || mv->y < min_y) { return 0; } if (bwd_mvx > max_x || bwd_mvx < min_x || bwd_mvy > max_y || bwd_mvy < min_y) { return 0; } return 1; } /** * --------------------------------------------------------------------------- * Function : get temporal motion vector predictor for SKIP/DIRECT mode in B frame * Parameters : * [in ] : h - encoder handler * [out] : none * Return : none * --------------------------------------------------------------------------- */ static INLINE void get_bskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbors) { int bid_flag = 0, bw_flag = 0, fwd_flag = 0, sym_flag = 0, bid2 = 0; int j; g_funcs.fast_memset(p_cumode->skip_mv_1st, 0, sizeof(p_cumode->skip_mv_1st) + sizeof(p_cumode->skip_mv_2nd)); for (j = 0; j < 6; j++) { if (p_neighbors[j].i_dir_pred == PDIR_BID) { p_cumode->skip_mv_2nd[DS_B_BID] = p_neighbors[j].mv[1]; p_cumode->skip_mv_1st[DS_B_BID] = p_neighbors[j].mv[0]; bid_flag++; if (bid_flag == 1) { bid2 = j; } } else if (p_neighbors[j].i_dir_pred == PDIR_SYM) { p_cumode->skip_mv_2nd[DS_B_SYM] = p_neighbors[j].mv[1]; p_cumode->skip_mv_1st[DS_B_SYM] = p_neighbors[j].mv[0]; sym_flag++; } 
else if (p_neighbors[j].i_dir_pred == PDIR_BWD) { p_cumode->skip_mv_2nd[DS_B_BWD] = p_neighbors[j].mv[1]; bw_flag++; } else if (p_neighbors[j].i_dir_pred == PDIR_FWD) { p_cumode->skip_mv_1st[DS_B_FWD] = p_neighbors[j].mv[0]; fwd_flag++; } } /* ڿ鲻˫Ԥʱ˫Skip/Directģʽ */ if (bid_flag == 0 && fwd_flag != 0 && bw_flag != 0) { p_cumode->skip_mv_2nd[DS_B_BID] = p_cumode->skip_mv_2nd[DS_B_BWD]; p_cumode->skip_mv_1st[DS_B_BID] = p_cumode->skip_mv_1st[DS_B_FWD]; } p_cumode->skip_ref_1st[DS_B_BID] = B_FWD; p_cumode->skip_ref_2nd[DS_B_BID] = B_BWD; /* ڿ鲻ڶԳԤʱԳSkip/Directģʽ */ if (sym_flag == 0) { if (bid_flag > 1) { /* ˫Ԥ飬ʹ˫Ԥ */ p_cumode->skip_mv_2nd[DS_B_SYM] = p_neighbors[bid2].mv[1]; p_cumode->skip_mv_1st[DS_B_SYM] = p_neighbors[bid2].mv[0]; } else if (bw_flag != 0) { /* ںԤ飬ʹúԤ */ p_cumode->skip_mv_2nd[DS_B_SYM] = p_cumode->skip_mv_2nd[DS_B_BWD]; p_cumode->skip_mv_1st[DS_B_SYM].x = -p_cumode->skip_mv_2nd[DS_B_BWD].x; p_cumode->skip_mv_1st[DS_B_SYM].y = -p_cumode->skip_mv_2nd[DS_B_BWD].y; } else if (fwd_flag != 0) { /* ǰԤ飬ʹǰԤ */ p_cumode->skip_mv_2nd[DS_B_SYM].x = -p_cumode->skip_mv_1st[DS_B_FWD].x; p_cumode->skip_mv_2nd[DS_B_SYM].y = -p_cumode->skip_mv_1st[DS_B_FWD].y; p_cumode->skip_mv_1st[DS_B_SYM] = p_cumode->skip_mv_1st[DS_B_FWD]; } } p_cumode->skip_ref_1st[DS_B_SYM] = B_FWD; p_cumode->skip_ref_2nd[DS_B_SYM] = B_BWD; /* Ԥ鲻ʱSkip/Directģʽ */ if (bw_flag == 0 && bid_flag > 1) { /* ˫Ԥ飬ʹ˫ԤһԪ */ p_cumode->skip_mv_2nd[DS_B_BWD] = p_neighbors[bid2].mv[1]; } else if (bw_flag == 0 && bid_flag != 0) { /* ֻһ˫Ԥʱʹ˫бĺ */ p_cumode->skip_mv_2nd[DS_B_BWD] = p_cumode->skip_mv_2nd[DS_B_BID]; } p_cumode->skip_ref_1st[DS_B_BWD] = INVALID_REF; p_cumode->skip_ref_2nd[DS_B_BWD] = B_BWD; /* ǰԤ鲻ʱǰSkip/Directģʽ䣬ƺSkip/Directģʽ */ if (fwd_flag == 0 && bid_flag > 1) { p_cumode->skip_mv_1st[DS_B_FWD] = p_neighbors[bid2].mv[0]; } else if (fwd_flag == 0 && bid_flag != 0) { p_cumode->skip_mv_1st[DS_B_FWD] = p_cumode->skip_mv_1st[DS_B_BID]; } p_cumode->skip_ref_1st[DS_B_FWD] = B_FWD; p_cumode->skip_ref_2nd[DS_B_FWD] = INVALID_REF; } /** * --------------------------------------------------------------------------- * Function : get spatial motion vector predictor for SKIP/DIRECT mode in P/F frame * Parameters : * [in ] : h - encoder handler * [in ] : blocksize - size of current block * [out] : - * Return : * --------------------------------------------------------------------------- */ static void get_pskip_mv_spatial(cu_mode_t *p_cumode, const neighbor_inter_t *p_neighbors) { int bid_flag = 0, fwd_flag = 0, bid2 = 0, fwd2 = 0; int j; g_funcs.fast_memset(p_cumode->skip_mv_1st, 0, sizeof(p_cumode->skip_mv_1st) + sizeof(p_cumode->skip_mv_2nd) + sizeof(p_cumode->skip_ref_1st) + sizeof(p_cumode->skip_ref_2nd)); for (j = 0; j < 6; j++) { if (p_neighbors[j].ref_idx[0] != INVALID_REF && p_neighbors[j].ref_idx[1] != INVALID_REF) { // dual prediction p_cumode->skip_ref_1st[DS_DUAL_1ST] = p_neighbors[j].ref_idx[0]; p_cumode->skip_ref_2nd[DS_DUAL_1ST] = p_neighbors[j].ref_idx[1]; p_cumode->skip_mv_1st[DS_DUAL_1ST] = p_neighbors[j].mv[0]; p_cumode->skip_mv_2nd[DS_DUAL_1ST] = p_neighbors[j].mv[1]; bid_flag++; if (bid_flag == 1) { bid2 = j; } } else if (p_neighbors[j].ref_idx[0] != INVALID_REF && p_neighbors[j].ref_idx[1] == INVALID_REF) { // fwd p_cumode->skip_ref_1st[DS_SINGLE_1ST] = p_neighbors[j].ref_idx[0]; p_cumode->skip_ref_2nd[DS_SINGLE_1ST] = INVALID_REF; p_cumode->skip_mv_1st[DS_SINGLE_1ST] = p_neighbors[j].mv[0]; fwd_flag++; if (fwd_flag == 1) { fwd2 = j; } } } // first dual if (bid_flag == 0 && fwd_flag > 1) { 
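/* no dual-prediction neighbour exists but at least two forward neighbours do:
 * synthesize the first dual candidate by pairing the forward MV/reference kept
 * in DS_SINGLE_1ST with that of the first forward neighbour found (fwd2) */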
p_cumode->skip_ref_1st[DS_DUAL_1ST] = p_cumode->skip_ref_1st[DS_SINGLE_1ST]; p_cumode->skip_ref_2nd[DS_DUAL_1ST] = p_neighbors[fwd2].ref_idx[0]; p_cumode->skip_mv_1st[DS_DUAL_1ST] = p_cumode->skip_mv_1st[DS_SINGLE_1ST]; p_cumode->skip_mv_2nd[DS_DUAL_1ST] = p_neighbors[fwd2].mv[0]; } // second dual if (bid_flag > 1) { p_cumode->skip_ref_1st[DS_DUAL_2ND] = p_neighbors[bid2].ref_idx[0]; p_cumode->skip_ref_2nd[DS_DUAL_2ND] = p_neighbors[bid2].ref_idx[1]; p_cumode->skip_mv_1st[DS_DUAL_2ND] = p_neighbors[bid2].mv[0]; p_cumode->skip_mv_2nd[DS_DUAL_2ND] = p_neighbors[bid2].mv[1]; } else if (bid_flag == 1 && fwd_flag > 1) { p_cumode->skip_ref_1st[DS_DUAL_2ND] = p_cumode->skip_ref_1st[DS_SINGLE_1ST]; p_cumode->skip_ref_2nd[DS_DUAL_2ND] = p_neighbors[fwd2].ref_idx[0]; p_cumode->skip_mv_1st[DS_DUAL_2ND] = p_cumode->skip_mv_1st[DS_SINGLE_1ST]; p_cumode->skip_mv_2nd[DS_DUAL_2ND] = p_neighbors[fwd2].mv[0]; } // first fwd p_cumode->skip_ref_2nd[DS_SINGLE_1ST] = INVALID_REF; if (fwd_flag == 0 && bid_flag > 1) { p_cumode->skip_ref_1st[DS_SINGLE_1ST] = p_neighbors[bid2].ref_idx[0]; p_cumode->skip_mv_1st [DS_SINGLE_1ST] = p_neighbors[bid2].mv[0]; } else if (fwd_flag == 0 && bid_flag == 1) { p_cumode->skip_ref_1st[DS_SINGLE_1ST] = p_cumode->skip_ref_1st[DS_DUAL_1ST]; p_cumode->skip_mv_1st [DS_SINGLE_1ST] = p_cumode->skip_mv_1st[DS_DUAL_1ST]; } // second fwd p_cumode->skip_ref_2nd[DS_SINGLE_2ND] = INVALID_REF; if (fwd_flag > 1) { p_cumode->skip_ref_1st[DS_SINGLE_2ND] = p_neighbors[fwd2].ref_idx[0]; p_cumode->skip_mv_1st [DS_SINGLE_2ND] = p_neighbors[fwd2].mv[0]; } else if (bid_flag > 1) { p_cumode->skip_ref_1st[DS_SINGLE_2ND] = p_neighbors[bid2].ref_idx[1]; p_cumode->skip_mv_1st [DS_SINGLE_2ND] = p_neighbors[bid2].mv[1]; } else if (bid_flag == 1) { p_cumode->skip_ref_1st[DS_SINGLE_2ND] = p_cumode->skip_ref_2nd[DS_DUAL_1ST]; p_cumode->skip_mv_1st [DS_SINGLE_2ND] = p_cumode->skip_mv_2nd[DS_DUAL_1ST]; } } /** * --------------------------------------------------------------------------- * Function : get temporal motion vector predictor for SKIP/DIRECT mode in P/F frame * Parameters : * [in ] : h - encoder handler * [in ] : p_cu - current encoding CU * [out] : none * Return : none * --------------------------------------------------------------------------- */ static void get_pskip_mv_temporal(xavs2_t *h, cu_t *p_cu) { cu_mode_t *p_cumode = cu_get_layer_mode(h, p_cu->cu_info.i_level); int blocksize2 = p_cu->i_size >> 1; int w_in_16x16 = (h->i_width_in_minpu + 3) >> 2; const int8_t *col_ref = h->fref[0]->pu_ref; const mv_t *col_mv = h->fref[0]->pu_mv; int pic_x = p_cu->i_pix_x; int pic_y = p_cu->i_pix_y; int refframe; int curT, colT; int k; for (k = 0; k < 4; k++) { int b_pix_x = pic_x + blocksize2 * (k & 1); int b_pix_y = pic_y + blocksize2 * (k >> 1); int col_pos = (b_pix_y >> 4) * w_in_16x16 + (b_pix_x >> 4); mv_t mv_1st; refframe = col_ref[col_pos]; if (refframe >= 0) { curT = calculate_distance(h, 0); colT = h->fref[0]->ref_dpoc[refframe]; mv_1st.x = scale_mv_skip(col_mv[col_pos].x, curT, colT); mv_1st.y = scale_mv_skip(col_mv[col_pos].y, curT, colT); } else { mv_1st.v = 0; } p_cumode->tskip_mv[k][0].v = mv_1st.v; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int get_mvp_type_default(int ref_frame, int rFrameL, int rFrameU, int rFrameUR, cb_t *p_cb) { int mvp_type = MVP_MEDIAN; if ((rFrameL != INVALID_REF) && (rFrameU == INVALID_REF) && (rFrameUR == INVALID_REF)) { mvp_type = MVP_LEFT; } else if ((rFrameL == INVALID_REF) && (rFrameU != INVALID_REF) && 
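/* selection rule: when exactly one of the left / top / top-right neighbours
 * carries a valid reference, its MV is used directly; for rectangular PUs the
 * neighbour adjacent to this particular sub-PU is preferred when its reference
 * index matches; otherwise the component-wise median of the three is taken */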
(rFrameUR == INVALID_REF)) { mvp_type = MVP_TOP; } else if ((rFrameL == INVALID_REF) && (rFrameU == INVALID_REF) && (rFrameUR != INVALID_REF)) { mvp_type = MVP_TR; } else if (p_cb->w < p_cb->h) { if (p_cb->x == 0) { if (rFrameL == ref_frame) { mvp_type = MVP_LEFT; } } else { if (rFrameUR == ref_frame) { mvp_type = MVP_TR; } } } else if (p_cb->w > p_cb->h) { if (p_cb->y == 0) { if (rFrameU == ref_frame) { mvp_type = MVP_TOP; } } else { if (rFrameL == ref_frame) { mvp_type = MVP_LEFT; } } } return mvp_type; } /* --------------------------------------------------------------------------- */ static int derive_median_mv(int16_t mva, int16_t mvb, int16_t mvc, int16_t *pmv) { int mvp_type; if (((mva < 0) && (mvb > 0) && (mvc > 0)) || ((mva > 0) && (mvb < 0) && (mvc < 0))) { *pmv = (mvb + mvc) / 2; mvp_type = 1; // b } else if (((mvb < 0) && (mva > 0) && (mvc > 0)) || ((mvb > 0) && (mva < 0) && (mvc < 0))) { *pmv = (mvc + mva) / 2; mvp_type = 2; // c } else if (((mvc < 0) && (mva > 0) && (mvb > 0)) || ((mvc > 0) && (mva < 0) && (mvb < 0))) { *pmv = (mva + mvb) / 2; mvp_type = 0; // a } else { const int dAB = XAVS2_ABS(mva - mvb); // for Ax const int dBC = XAVS2_ABS(mvb - mvc); // for Bx const int dCA = XAVS2_ABS(mvc - mva); // for Cx const int min_diff = XAVS2_MIN(dAB, XAVS2_MIN(dBC, dCA)); if (min_diff == dAB) { *pmv = (mva + mvb) / 2; mvp_type = 0; // a; } else if (min_diff == dBC) { *pmv = (mvb + mvc) / 2; mvp_type = 1; // b; } else { *pmv = (mvc + mva) / 2; mvp_type = 2; // c; } } return mvp_type; } /* --------------------------------------------------------------------------- * MV scaling for Normal Inter Mode (MVP + MVD) */ static ALWAYS_INLINE int16_t scale_mv_default(int mv, int dist_dst, int dist_src_scale) { mv = xavs2_sign3(mv) * ((XAVS2_ABS(mv) * dist_dst * dist_src_scale + HALF_MULTI) >> OFFSET); return (int16_t)(XAVS2_CLIP3(-32768, 32767, mv)); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int16_t scale_mv_default_y(xavs2_t *h, int16_t mvy, int dist_dst, int dist_src, int dist_src_scale) { int oriPOC = h->fdec->i_frm_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_frm_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_default(mvy + delta, dist_dst, dist_src_scale) - delta2); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void check_scaling_neighbor_mv_b(xavs2_t *h, mv_t *mv, int dist_dst, int dist_src_scale, int ref_neighbor) { if (ref_neighbor >= 0) { if (h->b_field_sequence == 0) { mv->y = scale_mv_default(mv->y, dist_dst, dist_src_scale); mv->x = scale_mv_default(mv->x, dist_dst, dist_src_scale); } else { mv->y = scale_mv_default_y(h, mv->y, dist_dst, dist_dst, dist_src_scale); mv->x = scale_mv_default ( mv->x, dist_dst, dist_src_scale); } } else { mv->v = 0; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void check_scaling_neighbor_mv(xavs2_t *h, mv_t *mv, int dist_dst, int ref_neighbor) { if (ref_neighbor >= 0) { int dist_src_scale = h->fdec->ref_dpoc_multi[ref_neighbor]; if (h->b_field_sequence == 0) { mv->y = scale_mv_default(mv->y, dist_dst, dist_src_scale); mv->x = scale_mv_default(mv->x, dist_dst, dist_src_scale); } else { int dist_src = h->fdec->ref_dpoc[ref_neighbor]; mv->y = scale_mv_default_y(h, mv->y, dist_dst, dist_src, dist_src_scale); mv->x = scale_mv_default ( 
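/* only the vertical component takes the field-parity (delta) correction of
 * scale_mv_default_y; the horizontal one is scaled directly. dist_src_scale
 * comes from ref_dpoc_multi and appears to be the fixed-point reciprocal of
 * the source distance, so scale_mv_default() approximates
 * mv * dist_dst / dist_src, e.g. mv = 10, dist_dst = 2, dist_src = 4 gives 5
 * (assuming OFFSET = 14 and HALF_MULTI = 1 << 13, the usual AVS2 convention) */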
mv->x, dist_dst, dist_src_scale); } } else { mv->v = 0; } } /* --------------------------------------------------------------------------- */ void get_mvp_default(xavs2_t *h, const neighbor_inter_t *p_neighbors, mv_t *pmv, int bwd_2nd, cb_t *p_cb, int ref_idx) { int is_available_UR = p_neighbors[BLK_TOPRIGHT].is_available; int rFrameL, rFrameU, rFrameUR, rFrameUL; int mvp_type; mv_t mva, mvb, mvc, mvd; rFrameL = p_neighbors[BLK_LEFT ].ref_idx[bwd_2nd]; rFrameU = p_neighbors[BLK_TOP ].ref_idx[bwd_2nd]; rFrameUL = p_neighbors[BLK_TOPLEFT ].ref_idx[bwd_2nd]; rFrameUR = is_available_UR ? p_neighbors[BLK_TOPRIGHT].ref_idx[bwd_2nd] : rFrameUL; mva = p_neighbors[BLK_LEFT ].mv[bwd_2nd]; mvb = p_neighbors[BLK_TOP ].mv[bwd_2nd]; mvd = p_neighbors[BLK_TOPLEFT ].mv[bwd_2nd]; mvc = is_available_UR ? p_neighbors[BLK_TOPRIGHT].mv[bwd_2nd] : mvd; mvp_type = get_mvp_type_default(ref_idx, rFrameL, rFrameU, rFrameUR, p_cb); if (h->i_type == SLICE_TYPE_B) { int mult_distance = h->fdec->ref_dpoc [bwd_2nd ? B_BWD : B_FWD]; int dist_src_scale = h->fdec->ref_dpoc_multi[bwd_2nd ? B_BWD : B_FWD]; check_scaling_neighbor_mv_b(h, &mva, mult_distance, dist_src_scale, rFrameL); check_scaling_neighbor_mv_b(h, &mvb, mult_distance, dist_src_scale, rFrameU); check_scaling_neighbor_mv_b(h, &mvc, mult_distance, dist_src_scale, rFrameUR); } else { int mult_distance = calculate_distance(h, ref_idx); check_scaling_neighbor_mv(h, &mva, mult_distance, rFrameL); check_scaling_neighbor_mv(h, &mvb, mult_distance, rFrameU); check_scaling_neighbor_mv(h, &mvc, mult_distance, rFrameUR); } switch (mvp_type) { case MVP_MEDIAN: // for x component derive_median_mv(mva.x, mvb.x, mvc.x, &pmv->x); // for y component derive_median_mv(mva.y, mvb.y, mvc.y, &pmv->y); break; case MVP_LEFT: pmv->v = mva.v; break; case MVP_TOP: pmv->v = mvb.v; break; case MVP_TR: pmv->v = mvc.v; break; default: assert(0); break; } } /* --------------------------------------------------------------------------- */ static INLINE void get_mvp_default_sad(xavs2_t *h, const neighbor_inter_t *p_neighbors, cu_t *p_cu, xavs2_me_t *p_me, mv_t *pmv, int bwd_2nd, cb_t *p_cb, int ref_idx) { int mode = p_cu->cu_info.i_mode; int pic_block_x = (p_cu->i_pix_x + p_cb->x) >> MIN_PU_SIZE_IN_BIT; int pic_block_y = (p_cu->i_pix_y + p_cb->y) >> MIN_PU_SIZE_IN_BIT; int width_in_4x4 = h->i_width_in_minpu; dist_t SAD[4] = { 0, 0, 0, 0 }; int is_available_UL = p_neighbors[BLK_TOPLEFT ].is_available; int is_available_UR = p_neighbors[BLK_TOPRIGHT].is_available; int rFrameL, rFrameU, rFrameUR, rFrameUL; int mvp_type; dist_t sad_space; mv_t mva, mvb, mvc, mvd; rFrameL = p_neighbors[BLK_LEFT ].ref_idx[bwd_2nd]; rFrameU = p_neighbors[BLK_TOP ].ref_idx[bwd_2nd]; rFrameUL = p_neighbors[BLK_TOPLEFT ].ref_idx[bwd_2nd]; rFrameUR = is_available_UR ? p_neighbors[BLK_TOPRIGHT].ref_idx[bwd_2nd] : rFrameUL; mva = p_neighbors[BLK_LEFT ].mv[bwd_2nd]; mvb = p_neighbors[BLK_TOP ].mv[bwd_2nd]; mvd = p_neighbors[BLK_TOPLEFT ].mv[bwd_2nd]; mvc = is_available_UR ? p_neighbors[BLK_TOPRIGHT].mv[bwd_2nd] : mvd; SAD[0] = pic_block_x > 0 ? h->all_mincost[(pic_block_y ) * width_in_4x4 + pic_block_x - 1][mode][ref_idx] : 0; SAD[1] = pic_block_y > 0 ? h->all_mincost[(pic_block_y - 1) * width_in_4x4 + pic_block_x ][mode][ref_idx] : 0; SAD[2] = is_available_UR ? h->all_mincost[(pic_block_y - 1) * width_in_4x4 + pic_block_x + 1][mode][ref_idx] : 0; SAD[3] = is_available_UL ? 
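/* SAD[0..3] hold the minimum ME costs already cached for the left, top,
 * top-right and top-left 4x4 positions at the same mode/reference; the entry
 * matching the chosen MVP direction becomes pred_sad_space, the spatial SAD
 * predictor used by the UMH early-termination tests */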
h->all_mincost[(pic_block_y - 1) * width_in_4x4 + pic_block_x - 1][mode][ref_idx] : 0; mvp_type = get_mvp_type_default(ref_idx, rFrameL, rFrameU, rFrameUR, p_cb); if (h->i_type == SLICE_TYPE_B) { int mult_distance = h->fdec->ref_dpoc [bwd_2nd ? B_BWD : B_FWD]; int dist_src_scale = h->fdec->ref_dpoc_multi[bwd_2nd ? B_BWD : B_FWD]; check_scaling_neighbor_mv_b(h, &mva, mult_distance, dist_src_scale, rFrameL); check_scaling_neighbor_mv_b(h, &mvb, mult_distance, dist_src_scale, rFrameU); check_scaling_neighbor_mv_b(h, &mvc, mult_distance, dist_src_scale, rFrameUR); } else { int mult_distance = calculate_distance(h, ref_idx); check_scaling_neighbor_mv(h, &mva, mult_distance, rFrameL); check_scaling_neighbor_mv(h, &mvb, mult_distance, rFrameU); check_scaling_neighbor_mv(h, &mvc, mult_distance, rFrameUR); } switch (mvp_type) { case MVP_MEDIAN: // for x component derive_median_mv(mva.x, mvb.x, mvc.x, &pmv->x); // for y component sad_space = SAD[derive_median_mv(mva.y, mvb.y, mvc.y, &pmv->y)]; break; case MVP_LEFT: pmv->v = mva.v; sad_space = SAD[0]; // a break; case MVP_TOP: pmv->v = mvb.v; sad_space = SAD[1]; // b break; case MVP_TR: pmv->v = mvc.v; sad_space = SAD[2]; // c break; default: sad_space = 0; assert(0); break; } p_me->pred_sad_space = sad_space; } /* --------------------------------------------------------------------------- */ static void fast_me_prepare_info_remove_mvp(xavs2_t *h, xavs2_me_t *p_me, int mode, int ref_idx, dist_t mincosts[MAX_INTER_MODES][MAX_REFS]) { dist_t pred_sad, sad_reference, sad_uplayer; /* get mvp & sad in upper layer */ if (mode == PRED_2NxnU || mode == PRED_2NxnD) { sad_uplayer = mincosts[PRED_2NxN][ref_idx] / 2; // sad in upper layer } else if (mode == PRED_nLx2N || mode == PRED_nRx2N) { sad_uplayer = mincosts[PRED_Nx2N][ref_idx] / 2; // sad in upper layer } else if (mode > PRED_2Nx2N) { sad_uplayer = mincosts[PRED_2Nx2N][ref_idx] / 2; // sad in upper layer } else { sad_uplayer = 0; // set flag, the up layer cannot be used } p_me->pred_sad_uplayer = sad_uplayer; /* get mvp & sad in nearest reference frame */ if (h->i_type == SLICE_TYPE_B && ref_idx == B_FWD) { sad_reference = 0; } else if (ref_idx > 0) { sad_reference = mincosts[mode][ref_idx - 1]; // sad in nearest reference frame } else { sad_reference = 0; } p_me->pred_sad_ref = sad_reference; /* get pred sad */ if (h->i_type != SLICE_TYPE_B && ref_idx > 0) { pred_sad = p_me->pred_sad_ref; } else if (mode == PRED_2Nx2N) { pred_sad = p_me->pred_sad_space; } else { pred_sad = p_me->pred_sad_uplayer; } p_me->pred_sad = pred_sad; } /* --------------------------------------------------------------------------- */ static void fast_me_prepare_info(xavs2_t *h, xavs2_me_t *p_me, int mode, int ref_idx,int pu_idx, dist_t mincosts[MAX_INTER_MODES][MAX_REFS]) { dist_t pred_sad, sad_reference, sad_uplayer; mv_t(*best_mvs)[4][MAX_REFS] = p_me->all_best_mv; /* get mvp & sad in upper layer */ if (mode == PRED_2NxnU || mode == PRED_2NxnD) { p_me->mvp1 = best_mvs[PRED_2NxN][pu_idx][ref_idx]; sad_uplayer = mincosts[PRED_2NxN][ref_idx] / 2; // sad in upper layer } else if (mode == PRED_nLx2N || mode == PRED_nRx2N) { p_me->mvp1 = best_mvs[PRED_Nx2N][pu_idx][ref_idx]; sad_uplayer = mincosts[PRED_Nx2N][ref_idx] / 2; // sad in upper layer } else if (mode > PRED_2Nx2N) { p_me->mvp1 = best_mvs[PRED_2Nx2N][pu_idx][ref_idx]; sad_uplayer = mincosts[PRED_2Nx2N][ref_idx] / 2; // sad in upper layer } else { p_me->mvp1.v = 0; sad_uplayer = 0; // set flag, the up layer cannot be used } p_me->pred_sad_uplayer = sad_uplayer; /* get mvp & 
sad in nearest reference frame */ if (h->i_type == SLICE_TYPE_B && ref_idx == B_FWD) { mv_t mv_bwd = best_mvs[mode][pu_idx][B_BWD]; p_me->mvp2.x = (int16_t)(-mv_bwd.x); p_me->mvp2.y = (int16_t)(-mv_bwd.y); sad_reference = mincosts[mode][B_BWD]; } else if (ref_idx > 0) { mv_t mv_last = best_mvs[mode][pu_idx][ref_idx - 1]; int dpoc_last = h->fdec->ref_dpoc[ref_idx - 1]; int dpoc_curr = h->fdec->ref_dpoc[ref_idx]; p_me->mvp2.x = (int16_t)(mv_last.x * dpoc_curr / (float)dpoc_last); p_me->mvp2.y = (int16_t)(mv_last.y * dpoc_curr / (float)dpoc_last); sad_reference = mincosts[mode][ref_idx - 1]; // sad in nearest reference frame } else { sad_reference = 0; } p_me->pred_sad_ref = sad_reference; /* get MV of collocated block */ if (h->fref[0] != NULL && h->fref[0]->i_frm_type != XAVS2_TYPE_I) { int stride_mvstore = (h->i_width_in_minpu + 3) >> 2; int pu_pos = (p_me->i_pix_y >> 4) * stride_mvstore + (p_me->i_pix_x >> 4); mv_t mv_col = h->fref[0]->pu_mv [pu_pos]; int ref_idx_col = h->fref[0]->pu_ref[pu_pos]; if (ref_idx_col >= 0) { int dpoc_col = h->fref[0]->ref_dpoc[ref_idx_col]; int dpoc_curr = h->fdec->ref_dpoc[ref_idx]; p_me->mvp3.x = (int16_t)(mv_col.x * dpoc_curr / (float)dpoc_col); p_me->mvp3.y = (int16_t)(mv_col.y * dpoc_curr / (float)dpoc_col); } else { p_me->mvp3.v = 0; } } else { p_me->mvp3.v = 0; } /* get pred sad */ if (h->i_type != SLICE_TYPE_B && ref_idx > 0) { pred_sad = p_me->pred_sad_ref; } else if (mode == PRED_2Nx2N) { pred_sad = p_me->pred_sad_space; } else { pred_sad = p_me->pred_sad_uplayer; } p_me->pred_sad = pred_sad; /* init beta parameters for UMH */ if (pred_sad != 0) { double threshold = h->umh_bsize[mode] / (pred_sad * pred_sad); p_me->beta2 = threshold - tab_umh_alpha_2nd[mode]; p_me->beta3 = threshold - tab_umh_alpha_3rd[mode]; } else { p_me->beta2 = 0; p_me->beta3 = 0; } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ int get_mv_predictors_bskip(xavs2_t *h, cu_t *p_cu) { cu_mode_t *p_cumode = cu_get_layer_mode(h, p_cu->cu_info.i_level); neighbor_inter_t *p_neighbors = cu_get_layer(h, p_cu->cu_info.i_level)->neighbor_inter; mv_t mv_1st, mv_2nd; cb_t cur_cb; const int8_t *col_ref = h->fref[0]->pu_ref; const mv_t *col_mv = h->fref[0]->pu_mv; int w_in_16x16 = (h->i_width_in_minpu + 3) >> 2; int i_level = p_cu->cu_info.i_level; int pix_x = p_cu->i_pix_x; int pix_y = p_cu->i_pix_y; int pic_block_x, pic_block_y; int col_mv_pos; int col_blk_ref; int k; int blocksize = 1 << i_level; int blocksize2 = 1 << (i_level - 1); assert(SLICE_TYPE_B == h->i_type); cur_cb.x = cur_cb.y = 0; cur_cb.w = cur_cb.h = (int8_t)blocksize; for (k = 0; k < 4; k++) { pic_block_y = pix_y + (k >> 1) * blocksize2; pic_block_x = pix_x + (k & 1) * blocksize2; col_mv_pos = (pic_block_y >> 4) * w_in_16x16 + (pic_block_x >> 4); col_blk_ref = col_ref[col_mv_pos]; if (col_blk_ref == INVALID_REF) { ///! 
9.5.8.4.3 ˶ʸ2 ԪΪ B_Skip_BiʱPUIJοΪ INVALID_REF get_mvp_default(h, p_neighbors, &mv_1st, 0, &cur_cb, B_FWD); // ﴫݵref_idxӰp_me->pred_sad_spaceʹ get_mvp_default(h, p_neighbors, &mv_2nd, 1, &cur_cb, B_BWD); } else { int TRp = h->fref[B_BWD]->ref_dpoc[col_blk_ref]; int dst_src_scale = h->fref[B_BWD]->ref_dpoc_multi[col_blk_ref]; int TRd = calculate_distance(h, B_BWD); int TRb = calculate_distance(h, B_FWD); mv_t mv_col = col_mv[col_mv_pos]; if (h->b_field_sequence == 0) { mv_1st.x = scale_mv_biskip(mv_col.x, TRb, dst_src_scale); mv_1st.y = scale_mv_biskip(mv_col.y, TRb, dst_src_scale); mv_2nd.x = -scale_mv_biskip(mv_col.x, TRd, dst_src_scale); mv_2nd.y = -scale_mv_biskip(mv_col.y, TRd, dst_src_scale); } else { mv_1st.x = scale_mv_biskip( mv_col.x, TRb, dst_src_scale); mv_1st.y = scale_mv_biskip_y(h, mv_col.y, TRb, TRp, dst_src_scale); mv_2nd.x = -scale_mv_biskip( mv_col.x, TRd, dst_src_scale); mv_2nd.y = -scale_mv_biskip_y(h, mv_col.y, TRd, TRp, dst_src_scale); } } p_cumode->tskip_mv[k][0] = mv_1st; p_cumode->tskip_mv[k][1] = mv_2nd; // only calculate block 0 for smallest CU, need copy MV of block 0 to block 1/2/3 if (i_level == MIN_CU_SIZE_IN_BIT) { for (k = 1; k < 4; k++) { p_cumode->tskip_mv[k][0] = mv_1st; p_cumode->tskip_mv[k][1] = mv_2nd; } break; } } get_bskip_mv_spatial(p_cumode, p_neighbors); return 1; } /* --------------------------------------------------------------------------- */ int get_mv_predictors_pskip(xavs2_t *h, cu_t *p_cu) { int i, k; get_pskip_mv_temporal(h, p_cu); if (h->i_type == SLICE_TYPE_F) { cu_mode_t *p_cu_mode = cu_get_layer_mode(h, p_cu->cu_info.i_level); neighbor_inter_t *p_neighbors = cu_get_layer(h, p_cu->cu_info.i_level)->neighbor_inter; if (h->i_ref > 1) { int *delta_P = h->fdec->ref_dpoc; for (k = 0; k < 4; k++) { mv_t mv_1st = p_cu_mode->tskip_mv[k][0]; for (i = 1; i < h->i_ref; i++) { mv_t mv_2nd; mv_2nd.x = scale_mv_skip ( mv_1st.x, delta_P[i], delta_P[0]); mv_2nd.y = scale_mv_skip_y(h, mv_1st.y, delta_P[i], delta_P[0]); p_cu_mode->tskip_mv[k][i] = mv_2nd; } } } get_pskip_mv_spatial(p_cu_mode, p_neighbors); if (h->param->enable_wsm) { return h->i_ref; } } return 1; } /* --------------------------------------------------------------------------- */ static INLINE int add_one_mv_candidate(xavs2_me_t *p_me, int16_t (*mvc)[2], int i_mvc, int x, int y) { int mv_x_min = p_me->mv_min_fpel[0]; int mv_y_min = p_me->mv_min_fpel[1]; int mv_x_max = p_me->mv_max_fpel[0]; int mv_y_max = p_me->mv_max_fpel[1]; int i; x = IPEL(x); y = IPEL(y); x = XAVS2_CLIP3(mv_x_min, mv_x_max, x); y = XAVS2_CLIP3(mv_y_min, mv_y_max, y); for (i = 0; i < i_mvc; i++) { if (mvc[i][0] == x && mvc[i][1] == y) { break; } } if (i == i_mvc) { mvc[i][0] = (int16_t)x; mvc[i][1] = (int16_t)y; return i_mvc + 1; } else { return i_mvc; } } /* --------------------------------------------------------------------------- */ int pred_inter_search_single(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *fwd_cost, dist_t *bwd_cost) { int16_t mvc[8][2] = {{0}}; int i_mvc = 0; int pu_size_shift = p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT; int mode = p_cu->cu_info.i_mode; int ref_idx; int best_ref_idx = 0; dist_t cost; int mv_mempos_x; int mv_mempos_y; mv_t mv; int b_mv_valid; // MVǷЧСȡֵǷڱ׼涨ЧΧ int pu_idx_x = p_cb->x != 0; // PU index in CU int pu_idx_y = p_cb->y != 0; int pu_idx = (pu_idx_y << 1) + pu_idx_x; int pix_x = p_cu->i_pix_x + p_cb->x; int pix_y = p_cu->i_pix_y + p_cb->y; int bsx = p_cb->w; int bsy = p_cb->h; int i, j, m, n, k; cu_mv_mode_t *p_mode_mvs = cu_get_layer_mode(h, 
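/* per-reference search flow used below: derive the MVP (with cached-SAD
 * bookkeeping when UMH is enabled), clip the search window, seed the candidate
 * list with the MVP and the zero MV, run xavs2_me_search(), store the best MV
 * for every covered PU position, and keep the reference with the lowest
 * cost + REF_COST for P/F slices; for B slices the forward and backward costs
 * are returned separately through fwd_cost / bwd_cost */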
p_cu->cu_info.i_level)->mvs[mode]; neighbor_inter_t *p_neighbors = cu_get_layer(h, p_cu->cu_info.i_level)->neighbor_inter; dist_t(*all_min_costs)[MAX_INTER_MODES][MAX_REFS]; int width_in_4x4 = h->i_width_in_minpu; int max_ref = h->i_ref; *fwd_cost = MAX_DISTORTION; mv_mempos_x = (pix_x + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT; // ǵ8x8ķǶԳƻ֣Ҫһλ mv_mempos_y = (pix_y + MIN_PU_SIZE - 1) >> MIN_PU_SIZE_IN_BIT; all_min_costs = &h->all_mincost[mv_mempos_y * width_in_4x4 + mv_mempos_x]; /* make p_fenc point to the start address of the current PU */ p_me->p_fenc = h->lcu.p_fenc[0] + (pix_y - h->lcu.i_pix_y) * FENC_STRIDE + pix_x - h->lcu.i_pix_x; p_me->i_pixel = PART_INDEX(bsx, bsy); p_me->i_pix_x = pix_x; p_me->i_pix_y = pix_y; p_me->i_block_w = bsx; p_me->i_block_h = bsy; /* calculate max allowed MV range * limit motion search to a slightly smaller range than the theoretical limit, * since the search may go a few iterations past its given range */ m = 6; // UMH: 1 for diamond, 2 for octagon, 2 for subpel i = (-MAX_CU_SIZE - pix_x) << 2; // mv min j = (h->i_width + MAX_CU_SIZE - pix_x - bsx) << 2; // mv max p_me->mv_min[0] = XAVS2_CLIP3(h->min_mv_range[0], h->max_mv_range[0], i); p_me->mv_max[0] = XAVS2_CLIP3(h->min_mv_range[0], h->max_mv_range[0], j); p_me->mv_min_fpel[0] = (p_me->mv_min[0] >> 2) + m; p_me->mv_max_fpel[0] = (p_me->mv_max[0] >> 2) - m; i = (-MAX_CU_SIZE - pix_y) << 2; // mv min j = (h->i_height + MAX_CU_SIZE - pix_y - bsy) << 2; // mv max p_me->mv_min[1] = XAVS2_CLIP3(h->min_mv_range[1], h->max_mv_range[1], i); p_me->mv_max[1] = XAVS2_CLIP3(h->min_mv_range[1], h->max_mv_range[1], j); p_me->mv_min_fpel[1] = (p_me->mv_min[1] >> 2) + m; p_me->mv_max_fpel[1] = (p_me->mv_max[1] >> 2) - m; // loop over all reference frames for (ref_idx = 0; ref_idx < max_ref; ref_idx++) { int bwd_2nd = h->i_type == SLICE_TYPE_B && ref_idx == B_BWD; xavs2_frame_t *p_ref_frm = h->fref[ref_idx]; mv_t *pred_mv = &p_mode_mvs[pu_idx].all_mvp[ref_idx]; /* get MVP (motion vector predictor) */ if (h->param->me_method == XAVS2_ME_UMH) { get_mvp_default_sad(h, p_neighbors, p_cu, p_me, pred_mv, bwd_2nd, p_cb, ref_idx); } else { get_mvp_default(h, p_neighbors, pred_mv, bwd_2nd, p_cb, ref_idx); } // MVP ȡִ֮У߶ p_me ״̬ p_me->i_ref_idx = (int16_t)ref_idx; if (h->param->me_method == XAVS2_ME_UMH) { fast_me_prepare_info(h, p_me, mode, ref_idx, pu_idx, all_min_costs[0]); } /* set reference index and pointer */ p_me->i_bias = pix_y * p_ref_frm->i_stride[IMG_Y] + pix_x; p_me->p_fref_1st = p_ref_frm; p_me->mvp.v = pred_mv->v; /* MVPȡֵMVPֵME */ b_mv_valid = check_mv_range(h, pred_mv, ref_idx, pix_x, pix_y, bsx, bsy); b_mv_valid &= check_mvd(h, pred_mv->x, pred_mv->y); /* Ĭϱĵλ */ i_mvc = 0; i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, p_me->mvp.x, p_me->mvp.y); i_mvc = add_one_mv_candidate(p_me, mvc, i_mvc, 0, 0); if (b_mv_valid) { cost = xavs2_me_search(h, p_me, mvc, i_mvc); } else { p_me->bmv = p_me->mvp; // MVPԽʱMVóɺMVPһС cost = MAX_DISTORTION; } mv = p_me->bmv; /* store motion vectors and reference frame (for motion vector prediction) */ p_me->all_best_imv[ref_idx] = p_me->bmv2; m = XAVS2_MAX(bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift), 1); n = XAVS2_MAX(bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift), 1); if (h->param->me_method == XAVS2_ME_UMH) { for (j = 0; j < n; j++) { for (i = 0; i < m; i++) { k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); assert(mode >= 0 && mode < MAX_INTER_MODES && k < 4 && k >= 0); p_mode_mvs[k].all_single_mv[ref_idx] = mv; p_me->all_best_mv[mode][k][ref_idx] = mv; } } } else { for (j = 0; j < n; j++) { 
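/* propagate the best MV to every entry of the CU's 2x2 PU-position grid that
 * this block covers, so that nested and asymmetric partitions searched later
 * can reuse it: all_single_mv for MV prediction and, under UMH, all_best_mv
 * as the up-layer MV predictor */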
for (i = 0; i < m; i++) { k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); assert(mode >= 0 && mode < MAX_INTER_MODES && k < 4 && k >= 0); p_mode_mvs[k].all_single_mv[ref_idx] = mv; } } } if (h->param->me_method == XAVS2_ME_UMH) { m = XAVS2_MAX(bsx >> MIN_PU_SIZE_IN_BIT, 1); n = XAVS2_MAX(bsy >> MIN_PU_SIZE_IN_BIT, 1); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) { all_min_costs[j * width_in_4x4 + i][mode][ref_idx] = p_me->bcost2; } } } b_mv_valid &= check_mv_range(h, &mv, ref_idx, pix_x, pix_y, bsx, bsy); b_mv_valid &= check_mvd(h, (mv.x - pred_mv->x), (mv.y - pred_mv->y)); if (!b_mv_valid) { cost = MAX_DISTORTION; } if (h->i_type == SLICE_TYPE_B) { // for SLICE_TYPE_B: only get the forward cost if (ref_idx == B_FWD) { *fwd_cost = cost; // forward cost p_me->bmvcost[PDIR_FWD] = p_me->mvcost[PDIR_FWD]; } else { *bwd_cost = cost; // backward cost p_me->bmvcost[PDIR_BWD] = p_me->mvcost[PDIR_FWD]; } } else { // for SLICE_TYPE_F or SLICE_TYPE_P cost += REF_COST(ref_idx); if (cost < *fwd_cost) { *fwd_cost = cost; best_ref_idx = ref_idx; p_me->bmvcost[PDIR_FWD] = p_me->mvcost[PDIR_FWD]; } } } return best_ref_idx; } /* --------------------------------------------------------------------------- * get cost for symirectional prediction */ void pred_inter_search_bi(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *sym_mcost, dist_t *bid_mcost) { int mode = p_cu->cu_info.i_mode; mv_t mvp, mv; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode]; pel_t *buf_pixel_temp = p_enc->buf_pixel_temp; int pu_size_shift = p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT; dist_t cost, cost_bid; int m, n, i, j; int b_mv_valid; // MVǷЧСȡֵǷڱ׼涨ЧΧ int pu_idx_x = p_cb->x != 0; // PU index in CU int pu_idx_y = p_cb->y != 0; int k = (pu_idx_y << 1) + pu_idx_x; int pix_x = p_cu->i_pix_x + p_cb->x; int pix_y = p_cu->i_pix_y + p_cb->y; int bsx = p_cb->w; int bsy = p_cb->h; int distance_fwd = calculate_distance(h, B_FWD); int distance_bwd = calculate_distance(h, B_BWD); // get fullpel search results mv_t fwd_mv = p_me->all_best_imv[B_FWD]; mv_t bwd_mv = p_mode_mv[k].all_single_mv[B_BWD]; assert(mode >= 0 && mode < MAX_INTER_MODES && k < 4 && k >= 0); // get MVP (motion vector predicator p_me->mvp1 = p_mode_mv[k].all_mvp[B_FWD]; p_me->mvp2 = p_mode_mv[k].all_mvp[B_BWD]; mvp = p_me->mvp1; // init motion vectors fwd_mv.x <<= 2; fwd_mv.y <<= 2; mv = fwd_mv; /* set reference index and pointer */ p_me->i_ref_idx = B_BWD; p_me->p_fref_1st = h->fref[B_FWD]; p_me->p_fref_2nd = h->fref[B_BWD]; p_me->i_distance_1st = distance_fwd; p_me->i_distance_2nd = distance_bwd; b_mv_valid = check_mv_range_sym(h, &mvp, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); b_mv_valid &= check_mv_range_sym(h, &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); b_mv_valid &= check_mvd(h, mvp.x, mvp.y); // avoid mv-bits calculation error if (b_mv_valid) { cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &mv); } else { cost = MAX_DISTORTION; } b_mv_valid = check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy); b_mv_valid &= check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy); b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y); // avoid mv-bits calculation error b_mv_valid &= check_mvd(h, p_me->mvp2.x, p_me->mvp2.y); if (b_mv_valid) { cost_bid = xavs2_me_search_bid(h, p_me, buf_pixel_temp, &fwd_mv, &bwd_mv, p_enc); } else { cost_bid = MAX_DISTORTION; } // store motion vectors m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + 
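/* m/n: number of columns/rows of the CU's 2x2 PU-position grid spanned by this
 * block; the symmetric MV and the forward/backward pair found above are
 * recorded for each covered entry */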
pu_size_shift)), 1); n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) { k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); p_mode_mv[k].all_sym_mv [0] = mv; p_mode_mv[k].all_dual_mv_1st[0] = fwd_mv; p_mode_mv[k].all_dual_mv_2nd[0] = bwd_mv; } } if (!(check_mv_range(h, &fwd_mv, B_FWD, pix_x, pix_y, bsx, bsy) && check_mvd(h, (fwd_mv.x - p_me->mvp1.x), (fwd_mv.y - p_me->mvp1.y)))) { cost_bid = MAX_DISTORTION; } if (!(check_mv_range(h, &bwd_mv, B_BWD, pix_x, pix_y, bsx, bsy) && check_mvd(h, (bwd_mv.x - p_me->mvp2.x), (bwd_mv.y - p_me->mvp2.y)))) { cost_bid = MAX_DISTORTION; } if (!(check_mv_range_sym(h, &mv, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd) && check_mvd(h, (mv.x - mvp.x), (mv.y - mvp.y)))) { cost = MAX_DISTORTION; } p_me->bmvcost[PDIR_SYM] = p_me->mvcost[PDIR_SYM]; p_me->bmvcost[PDIR_BID] = p_me->mvcost[PDIR_BID]; *sym_mcost = cost; *bid_mcost = cost_bid; } /* --------------------------------------------------------------------------- * get cost for dual hypothesis prediction */ void pred_inter_search_dual(xavs2_t *h, cu_t *p_cu, cb_t *p_cb, xavs2_me_t *p_me, dist_t *dual_mcost, int *dual_best_fst_ref, int *dual_best_snd_ref) { int mode = p_cu->cu_info.i_mode; mv_t fst_dual, snd_dual; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mode_mv = cu_get_layer_mode(h, p_cu->cu_info.i_level)->mvs[mode]; pel_t *buf_pixel_temp = p_enc->buf_pixel_temp; int pix_x = p_cu->i_pix_x + p_cb->x; int pix_y = p_cu->i_pix_y + p_cb->y; int pu_idx_x = p_cb->x != 0; // PU index int pu_idx_y = p_cb->y != 0; int bsx = p_cb->w; // block size int bsy = p_cb->h; int pu_size_shift = p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT; int ref_idx; dist_t cost; int distance_fwd, distance_bwd; int b_mv_valid; // MVǷЧСȡֵǷڱ׼涨ЧΧ int m, n, i, j, k; int max_ref = h->i_ref; *dual_mcost = MAX_DISTORTION; // loop over reference frames for (ref_idx = 0; ref_idx < max_ref; ref_idx++) { int snd_ref = !ref_idx; // get MVPs(motion vector predictors) k = (pu_idx_y << 1) + pu_idx_x; assert(mode >= 0 && mode < MAX_INTER_MODES && k < 4 && k >= 0); p_me->mvp1 = p_mode_mv[k].all_mvp[ref_idx]; /* set reference index and pointer */ p_me->i_ref_idx = (int16_t)ref_idx; p_me->i_distance_1st = distance_fwd = calculate_distance(h, ref_idx); p_me->i_distance_2nd = distance_bwd = calculate_distance(h, snd_ref); p_me->p_fref_1st = h->fref[ref_idx]; p_me->p_fref_2nd = h->fref[snd_ref]; // get the best fullpel search result fst_dual = p_me->all_best_imv[ref_idx]; // only for F frame, B frame are not called here fst_dual.x <<= 2; fst_dual.y <<= 2; // get the min motion cost for dual hypothesis prediction b_mv_valid = check_mv_range_sym(h, &fst_dual, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y)); b_mv_valid &= check_mvd(h, p_me->mvp1.x, p_me->mvp1.y); b_mv_valid &= check_mvd(h, p_me->mvp.x, p_me->mvp.y); if (b_mv_valid) { cost = xavs2_me_search_sym(h, p_me, buf_pixel_temp, &fst_dual); } else { cost = MAX_DISTORTION; } /* store motion vectors and reference frame (for motion vector prediction) */ snd_dual.v = MAKEDWORD(scale_mv_skip ( fst_dual.x, distance_bwd, distance_fwd), scale_mv_skip_y(h, fst_dual.y, distance_bwd, distance_fwd)); m = XAVS2_MAX((bsx >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); n = XAVS2_MAX((bsy >> (MIN_PU_SIZE_IN_BIT + pu_size_shift)), 1); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) { k = ((pu_idx_y + j) << 1) + (pu_idx_x + i); 
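/* the second MV of the dual-hypothesis pair is not searched: snd_dual was
 * derived above by scaling fst_dual from the first reference distance to the
 * second (scale_mv_skip / scale_mv_skip_y); both MVs are stored per position */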
p_mode_mv[k].all_dual_mv_1st[ref_idx] = fst_dual; p_mode_mv[k].all_dual_mv_2nd[ref_idx] = snd_dual; } } b_mv_valid &= check_mv_range_sym(h, &fst_dual, pix_x, pix_y, bsx, bsy, distance_fwd, distance_bwd); b_mv_valid &= check_mvd(h, (fst_dual.x - p_me->mvp1.x), (fst_dual.y - p_me->mvp1.y)); if (!b_mv_valid) { cost = MAX_DISTORTION; } else { cost += REF_COST(ref_idx); if (cost < *dual_mcost) { *dual_mcost = cost; *dual_best_fst_ref = ref_idx; *dual_best_snd_ref = !ref_idx; p_me->bmvcost[PDIR_DUAL] = p_me->mvcost[PDIR_SYM]; } } } } xavs2-1.3/source/encoder/md_intra.c000066400000000000000000000522531340660520300173060ustar00rootroot00000000000000/* * md_intra.c * * Description of this file: * Mode decision functions definition for Intra prediction of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "block_info.h" #include "cudata.h" /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE bool_t is_block_available(xavs2_t *h, int x_4x4, int y_4x4, int dx_4x4, int dy_4x4, int cur_slice_idx) { int x2_4x4 = x_4x4 + dx_4x4; int y2_4x4 = y_4x4 + dy_4x4; if (x2_4x4 < 0 || y2_4x4 < 0 || x2_4x4 >= h->i_width_in_minpu || y2_4x4 >= h->i_height_in_minpu) { return 0; } else { return cur_slice_idx == cu_get_slice_index(h, x2_4x4 >> 1, y2_4x4 >> 1); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE uint32_t get_intra_neighbors(xavs2_t *h, int x_4x4, int y_4x4, int bsx, int bsy, int cur_slice_idx) { const int lcu_mask = (1 << (h->i_lcu_level - 2)) - 1; int leftdown, topright; /* 1. ڿǷͬһSlice */ uint32_t b_LEFT = is_block_available(h, x_4x4, y_4x4, -1, 0, cur_slice_idx); uint32_t b_TOP = is_block_available(h, x_4x4, y_4x4, 0, -1, cur_slice_idx); uint32_t b_TOP_LEFT = is_block_available(h, x_4x4, y_4x4, -1, -1, cur_slice_idx); uint32_t b_TOP_RIGHT = is_block_available(h, x_4x4, y_4x4, (bsx >> 1) - 1, -1, cur_slice_idx); // (bsx >> MIN_PU_SIZE_IN_BIT << 1) uint32_t b_LEFT_DOWN = is_block_available(h, x_4x4, y_4x4, -1, (bsy >> 1) - 1, cur_slice_idx); // (bsy >> MIN_PU_SIZE_IN_BIT << 1) /* 2. 
ڿǷڵǰ֮ǰع */ x_4x4 &= lcu_mask; y_4x4 &= lcu_mask; leftdown = h->tab_avail_DL[((y_4x4 + (bsy >> 2) - 1) << (h->i_lcu_level - B4X4_IN_BIT)) + (x_4x4)]; topright = h->tab_avail_TR[((y_4x4) << (h->i_lcu_level - B4X4_IN_BIT)) + (x_4x4 + (bsx >> 2) - 1)]; b_LEFT_DOWN = b_LEFT_DOWN && leftdown; b_TOP_RIGHT = b_TOP_RIGHT && topright; return (b_LEFT << MD_I_LEFT) | (b_TOP << MD_I_TOP) | (b_TOP_LEFT << MD_I_TOP_LEFT) | (b_TOP_RIGHT << MD_I_TOP_RIGHT) | (b_LEFT_DOWN << MD_I_LEFT_DOWN); } /* --------------------------------------------------------------------------- * get intra PU availability */ static ALWAYS_INLINE uint32_t get_intra_pu_avail(cu_t *p_cu, int block_x, int block_y, int bsx, int bsy) { int cu_size = p_cu->i_size; uint32_t cu_avail = p_cu->intra_avail; uint32_t avail; if (block_x == 0 && block_y == 0) { avail = cu_avail; if (bsx < cu_size) { avail = (avail & (~(1 << MD_I_TOP_RIGHT))) | (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_TOP) << MD_I_TOP_RIGHT); } if (bsy < cu_size) { avail = (avail & (~(1 << MD_I_LEFT_DOWN))) | (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT) << MD_I_LEFT_DOWN); } } else if (block_y == 0) { avail = (cu_avail & (1 << MD_I_TOP)); // ϱ߽CUϱ߽¾ avail |= (1 << MD_I_LEFT); // ߽ avail |= ((cu_avail >> MD_I_TOP) & 1) << MD_I_TOP_LEFT; // CUϱ߽Ծ if (block_x + bsx < cu_size) { // CUϱ߽ϱ߽ avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_TOP)) << MD_I_TOP_RIGHT; } else { avail |= cu_avail & (1 << MD_I_TOP_RIGHT); } } else if (block_x == 0) { avail = (cu_avail & (1 << MD_I_LEFT)); // ߽CU߽ avail |= (1 << MD_I_TOP); // ϱ߽ avail |= ((cu_avail >> MD_I_LEFT) & 1) << MD_I_TOP_LEFT; // CUϱ߽Ծ if (bsx < cu_size && bsy < cu_size) { // avail |= 1 << MD_I_TOP_RIGHT; } // if (block_y + bsy < cu_size) { avail |= (!!IS_NEIGHBOR_AVAIL(cu_avail, MD_I_LEFT)) << MD_I_LEFT_DOWN; } else { avail |= cu_avail & (1 << MD_I_LEFT_DOWN); } } else { // ϡ² avail = (1 << MD_I_LEFT) | (1 << MD_I_TOP) | (1 << MD_I_TOP_LEFT); } return avail; } /* --------------------------------------------------------------------------- * fill reference samples for luma component */ static INLINE void fill_ref_samples_luma(xavs2_t *h, cu_t *p_cu, pel_t *EP, int img_x, int img_y, int block_x, int block_y, int bsx, int bsy) { int pos_x = (img_x - h->lcu.i_pix_x - 1); int pos_y = (img_y - h->lcu.i_pix_y - 1); pel_t *pTL = h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x; int xy = (((pos_y + 1) != 0) << 1) + ((pos_x + 1) != 0); uint32_t avail; /* 1, ο߽Ч */ if (img_x + 2 * bsx <= h->i_width && img_y + 2 * bsy <= h->i_height && 0) { // TODO: ߵ²ƥ䣬ԲԭĬģʽ avail = get_intra_pu_avail(p_cu, block_x, block_y, bsx, bsy); } else { int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT); int b8_x = img_x >> MIN_PU_SIZE_IN_BIT; int b8_y = img_y >> MIN_PU_SIZE_IN_BIT; avail = get_intra_neighbors(h, b8_x, b8_y, bsx, bsy, cur_slice_idx); } p_cu->block_avail = (uint8_t)avail; /* 2, ɲο߽ص */ g_funcs.fill_edge_f[xy](pTL, FDEC_STRIDE, h->lcu.ctu_border[0].rec_top + pos_x - pos_y, EP, avail, bsx, bsy); } /* --------------------------------------------------------------------------- * \param h : handle of the encoder * \param src: (src + 1) is aligned to 32-byte, src[1] is the 1st pixel in top reference row * \param dst: aligned to 32-byte */ static INLINE void xavs2_intra_prediction(xavs2_t *h, pel_t *src, pel_t *dst, int i_dst, int dir_mode, int i_avail, int bsx, int bsy) { UNUSED_PARAMETER(h); if (dir_mode != DC_PRED) { g_funcs.intraf[dir_mode](src, dst, i_dst, dir_mode, bsx, bsy); } else { int b_top = 
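/* for DC prediction the kernel needs to know which borders actually exist:
 * the top and left availability flags are packed into the mode argument as
 * (b_top << 8) + b_left so the DC value can be normalized accordingly */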
!!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP); int b_left = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT); int mode_ex = ((b_top << 8) + b_left); g_funcs.intraf[dir_mode](src, dst, i_dst, mode_ex, bsx, bsy); } } /** * =========================================================================== * interface function definition * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void update_candidate_list(int mode, rdcost_t cost, int max_num, intra_candidate_t *p_candidates) { int shift = 0; p_candidates += max_num - 1; while (shift < max_num && cost < p_candidates->cost) { p_candidates[0].mode = p_candidates[-1].mode; p_candidates[0].cost = p_candidates[-1].cost; shift++; p_candidates--; } p_candidates[1].mode = mode; p_candidates[1].cost = cost; } /* --------------------------------------------------------------------------- * used for generating intra luma prediction samples */ #define PREDICT_ADD_LUMA(MODE_IDX) \ {\ pel_t *p_pred = p_enc->intra_pred[MODE_IDX];\ int mode_bits = (mpm[0] == (MODE_IDX) || mpm[1] == (MODE_IDX)) ? 2 : 6;\ rdcost_t cost = h->f_lambda_mode * mode_bits; \ \ xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, MODE_IDX,\ p_cu->block_avail, block_w, block_h);\ cost += intra_cmp(p_fenc, FENC_STRIDE, p_pred, block_w);\ update_candidate_list(MODE_IDX, cost, INTRA_MODE_NUM_FOR_RDO, p_candidates);\ } /* --------------------------------------------------------------------------- * return numbers for RDO and candidate list by scanning all the intra modes */ int rdo_get_pred_intra_luma(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)]; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; int mode; int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); UNUSED_PARAMETER(blockidx); /* loop over all intra predication modes */ for (mode = 0; mode < NUM_INTRA_MODE; mode++) { PREDICT_ADD_LUMA(mode); } p_cu->feature.intra_had_cost = p_candidates[0].cost; return h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)]; } /* --------------------------------------------------------------------------- * return numbers for RDO and candidate list by rough scanning */ int rdo_get_pred_intra_luma_rmd(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { int visited[NUM_INTRA_MODE] = { 0 }; /* 0: not visited yet * 1: visited in the first phase * 2: visited in final_mode */ pixel_cmp_t intra_cmp = g_funcs.pixf.intra_cmp[PART_INDEX(block_w, block_h)]; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; int mode, i, j; int num_angle = 0; int num_for_rdo; int num_to_add; int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); 
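/* rough mode decision (RMD): evaluate DC/Plane/Bilinear (modes 0..2) and every
 * fourth angular mode with the intra_cmp (SATD-style) cost, then refine around
 * the best-ranked angular modes at distance 2 and distance 1, add the two MPMs
 * if not yet visited, and finally keep only a few candidates for full RDO
 * (count taken from tab_num_intra_rdo). For example, if mode 12 ranks high
 * after the coarse scan, modes 10/14 and then 11/13 are also tested. */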
UNUSED_PARAMETER(blockidx); /* 1, ģʽ * (1.1) ؼĽǶ */ for (mode = 0; mode < 3; mode++) { PREDICT_ADD_LUMA(mode); visited[mode] = 1; } /* (1.2) ǶԤģʽ */ for (mode = 4; mode < NUM_INTRA_MODE; mode += 4) { PREDICT_ADD_LUMA(mode); visited[mode] = 1; } /* 2, NŵģʽľΪģʽŵCandModeList */ num_to_add = h->num_intra_rmd_dist2; for (i = 0; i < num_to_add; i++) { mode = p_candidates[i].mode; if (mode <= 2) { continue; } if (mode > 3 && !visited[mode - 2]) { j = mode - 2; PREDICT_ADD_LUMA(j); visited[j] = 1; } if (mode < NUM_INTRA_MODE - 2 && !visited[mode + 2]) { j = mode + 2; PREDICT_ADD_LUMA(j); visited[j] = 1; } } /* 3, ϵõѵģʽľΪһģʽCandModeList */ num_to_add = h->num_intra_rmd_dist1; for (i = 0, num_angle = 0; num_angle < num_to_add && i < INTRA_MODE_NUM_FOR_RDO; i++) { mode = p_candidates[i].mode; if (mode <= 2) { continue; } if (mode > 3 && !visited[mode - 1]) { j = mode - 1; PREDICT_ADD_LUMA(j); visited[j] = 1; num_angle++; } if (mode < NUM_INTRA_MODE - 1 && !visited[mode + 1]) { j = mode + 1; PREDICT_ADD_LUMA(j); visited[j] = 1; num_angle++; } } /* 4, бǷMPMsûУ룬ü */ if (!visited[mpm[0]]) { mode = mpm[0]; PREDICT_ADD_LUMA(mode); visited[mode] = 1; } if (!visited[mpm[1]]) { mode = mpm[1]; PREDICT_ADD_LUMA(mode); visited[mode] = 1; } num_for_rdo = h->tab_num_intra_rdo[p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON)]; /* ǰֲŵģʽMPM֮һRDOģʽ */ if (p_candidates[0].mode == mpm[0] || p_candidates[0].mode == mpm[1] || p_candidates[1].mode == mpm[0] || p_candidates[1].mode == mpm[1]) { num_for_rdo = XAVS2_MIN(num_for_rdo, 3); return num_for_rdo; } /* MģʽѡղμRDOģʽȥ */ visited[p_candidates[0].mode] = 2; visited[p_candidates[1].mode] = 2; for (i = 2, j = 2; i < INTRA_MODE_NUM_FOR_RDO && j < num_for_rdo; i++) { mode = p_candidates[i].mode; if (!visited[mode]) { continue; } if (mode <= 2) { p_candidates[j++].mode = mode; visited[mode] = 2; } else if (mode == 3) { if (visited[4] == 1) { p_candidates[j++].mode = 3; visited[3] = 2; } } else if (mode == 32) { if (visited[31] == 1) { p_candidates[j++].mode = 32; visited[32] = 2; } } else { if (visited[mode - 1] == 1 && visited[mode + 1] == 1) { p_candidates[j++].mode = mode; visited[mode] = 2; } } if (visited[0] == 2 && visited[1] == 2 && visited[2] == 2) { break; } } p_cu->feature.intra_had_cost = p_candidates[0].cost; return XAVS2_MIN(num_for_rdo, j); } /* --------------------------------------------------------------------------- * return the best intra prediction mode from the 1st run */ int rdo_get_pred_intra_luma_2nd_pass(xavs2_t *h, cu_t *p_cu, intra_candidate_t *p_candidates, pel_t *p_fenc, int mpm[], int blockidx, int block_x, int block_y, int block_w, int block_h) { cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); int best_intra_mode = p_cu->cu_info.real_intra_modes[blockidx]; pel_t *edge_pixels = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 2) - 1]; pel_t *p_pred = p_enc->intra_pred[best_intra_mode]; int img_x = h->lcu.i_pix_x + p_cu->i_pos_x + block_x; int img_y = h->lcu.i_pix_y + p_cu->i_pos_y + block_y; /* get edge samples for intra prediction */ fill_ref_samples_luma(h, p_cu, edge_pixels, img_x, img_y, block_x, block_y, block_w, block_h); UNUSED_PARAMETER(p_fenc); UNUSED_PARAMETER(mpm); xavs2_intra_prediction(h, edge_pixels, p_pred, block_w, best_intra_mode, p_cu->block_avail, block_w, block_h); p_candidates[0].mode = best_intra_mode; p_candidates[0].cost = 0; return 1; } #undef PREDICT_ADD_LUMA //#if OPT_FAST_RDO_INTRA_C /* --------------------------------------------------------------------------- * predict an intra chroma 
block (fast) */ int rdo_get_pred_intra_chroma_fast(xavs2_t *h, cu_t *p_cu, int i_level, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list) { cu_parallel_t *p_enc = cu_get_enc_context(h, i_level + 1); pel_t *p_fenc_u = h->lcu.p_fenc[IMG_U] + pix_y_c * FENC_STRIDE + pix_x_c; pel_t *p_fenc_v = h->lcu.p_fenc[IMG_V] + pix_y_c * FENC_STRIDE + pix_x_c; int blksize = 1 << i_level; pixel_cmp_t intra_chroma_cost = g_funcs.pixf.intra_cmp[PART_INDEX(blksize, blksize)]; int num_for_rdo = 0; int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1]; pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2); int xy = p_cu->in_lcu_edge; /* UVϽصλ */ pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; int offset = (FREC_CSTRIDE >> 1); int m; /* ߽Ч */ uint32_t avail = p_cu->intra_avail; /* ÿģʽŶӦԤģʽ */ LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; /* 2.1, ȡο߽ */ g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, blksize, blksize); g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, blksize, blksize); for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { p_candidate_list[m].mode = DM_PRED_C; p_candidate_list[m].cost = MAX_COST; } /* 2.2, ִԤ */ for (m = 0; m < NUM_INTRA_MODE_CHROMA; m++) { pel_t *p_pred_u = p_enc->intra_pred_c[m]; pel_t *p_pred_v = p_enc->intra_pred_c[m] + offset; rdcost_t est_cost; xavs2_intra_prediction(h, EP_u, p_pred_u, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); xavs2_intra_prediction(h, EP_v, p_pred_v, FREC_CSTRIDE, LUMA_MODE[m], avail, blksize, blksize); est_cost = intra_chroma_cost(p_fenc_u, FENC_STRIDE, p_pred_u, FREC_CSTRIDE); est_cost += intra_chroma_cost(p_fenc_v, FENC_STRIDE, p_pred_v, FREC_CSTRIDE); update_candidate_list(m, est_cost, NUM_INTRA_MODE_CHROMA, p_candidate_list); } if (h->i_type != SLICE_TYPE_I) { num_for_rdo = NUM_INTRA_C_FULL_RD; if (i_level == 6) { num_for_rdo -= 2; } else if (i_level == 5) { num_for_rdo -= 1; } } else { num_for_rdo = NUM_INTRA_MODE_CHROMA; } if (p_candidate_list[0].mode == DM_PRED_C) { num_for_rdo = 1; } num_for_rdo = XAVS2_MIN(h->num_rdo_intra_chroma, num_for_rdo); return num_for_rdo; } //#endif /* --------------------------------------------------------------------------- * predict an intra chroma block */ int rdo_get_pred_intra_chroma(xavs2_t *h, cu_t *p_cu, int i_level_c, int pix_y_c, int pix_x_c, intra_candidate_t *p_candidate_list) { int LUMA_MODE[5] = { -1, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; // map chroma mode to luma mode cu_parallel_t *p_enc = cu_get_enc_context(h, i_level_c + 1); pel_t *EP_u = &p_enc->buf_edge_pixels[(MAX_CU_SIZE << 1) - 1]; pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2); int bsize = 1 << i_level_c; int xy = p_cu->in_lcu_edge; /* UVϽصλ */ pel_t *pTL_u = h->lcu.p_fdec[1] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; pel_t *pTL_v = h->lcu.p_fdec[2] + (pix_y_c - 1) * FDEC_STRIDE + pix_x_c - 1; int offset = (FREC_CSTRIDE >> 1); int m; /* ߽Ч */ uint32_t avail = p_cu->intra_avail; /* ÿģʽŶӦԤģʽ */ LUMA_MODE[0] = p_cu->cu_info.real_intra_modes[0]; /* 2.1, ȡο߽ */ g_funcs.fill_edge_f[xy](pTL_u, FDEC_STRIDE, h->lcu.ctu_border[1].rec_top + pix_x_c - pix_y_c, EP_u, avail, bsize, bsize); g_funcs.fill_edge_f[xy](pTL_v, FDEC_STRIDE, h->lcu.ctu_border[2].rec_top + pix_x_c - pix_y_c, EP_v, avail, bsize, bsize); /* 2.2, ִԤ */ for (m = 
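/* chroma candidates map onto luma prediction kernels through LUMA_MODE[]:
 * index 0 (the DM mode) reuses the CU's first real luma intra mode, the others
 * are DC / horizontal / vertical / bilinear; every candidate is predicted for
 * both U and V and handed to full RDO with its cost preset to MAX_COST */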
0; m < NUM_INTRA_MODE_CHROMA; m++) { xavs2_intra_prediction(h, EP_u, p_enc->intra_pred_c[m] + 0, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); xavs2_intra_prediction(h, EP_v, p_enc->intra_pred_c[m] + offset, FREC_CSTRIDE, LUMA_MODE[m], avail, bsize, bsize); p_candidate_list[m].mode = m; p_candidate_list[m].cost = MAX_COST; } return NUM_INTRA_MODE_CHROMA; } /* --------------------------------------------------------------------------- */ uint32_t xavs2_intra_get_cu_neighbors(xavs2_t *h, cu_t *p_cu, int img_x, int img_y, int cu_size) { UNUSED_PARAMETER(p_cu); int cur_slice_idx = cu_get_slice_index(h, img_x >> MIN_CU_SIZE_IN_BIT, img_y >> MIN_CU_SIZE_IN_BIT); int b8_x = img_x >> MIN_PU_SIZE_IN_BIT; int b8_y = img_y >> MIN_PU_SIZE_IN_BIT; return get_intra_neighbors(h, b8_x, b8_y, cu_size, cu_size, cur_slice_idx); } xavs2-1.3/source/encoder/me.c000066400000000000000000001506341340660520300161140ustar00rootroot00000000000000/* * me.c * * Description of this file: * ME functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "block_info.h" #include "me.h" #include "common/cpu.h" #include "common/mc.h" #include "predict.h" /** * =========================================================================== * type defines * =========================================================================== */ typedef struct mv_info { int bmx; /* best mv (x) */ int bmy; /* best mv (y) */ int bdir; /* best direction */ dist_t bcost; /* best cost */ dist_t bdist; /* best distance */ } mv_info; /** * =========================================================================== * local/global variables * =========================================================================== */ /* --------------------------------------------------------------------------- * big hexagon for UMH */ static const int8_t HEX4[16][2] = { { 0, -4 }, { 0, 4 }, { -2, -3 }, { 2, -3 }, { -4, -2 }, { 4, -2 }, { -4, -1 }, { 4, -1 }, { -4, 0 }, { 4, 0 }, { -4, 1 }, { 4, 1 }, { -4, 2 }, { 4, 2 }, { -2, 3 }, { 2, 3 } }; static const int8_t FAST_HEX4[8][2] = { { 0, -4 }, { 0, 4 }, { -2, -3 }, { 2, -3 }, { -4, 0 }, { 4, 0 }, { -2, 3 }, { 2, 3 } }; /* --------------------------------------------------------------------------- * radius 2 hexagon * repeated entries are to avoid having to compute mod6 every time */ const int8_t HEX2[8][2] = { {-1, -2}, /* 0, 0(6) 5 */ {-2, 0}, /* 1, */ {-1, 2}, /* 2, */ { 1, 2}, /* 3, 1(7) * 4 */ { 2, 0}, /* 4, */ { 1, -2}, /* 5, */ {-1, -2}, /* 6, 2 3 */ {-2, 0} /* 7, */ }; /* --------------------------------------------------------------------------- * (x - 1) % 6 */ const int8_t M1MOD6[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* --------------------------------------------------------------------------- * radius 1 diamond * repeated entries are to avoid having to compute mod4 every time */ const int8_t DIA1[6][2] = { { 0, -1}, /* 0, */ {-1, 0}, /* 1, 0(4) */ { 0, 1}, /* 2, */ { 1, 0}, /* 3, 1(5) * 3 */ { 0, -1}, /* 4, */ {-1, 0} /* 5, 2 */ }; /* --------------------------------------------------------------------------- * (x - 1) % 4 */ const int8_t M1MOD4[6] = { 3, 0, 1, 2, 3, 0 }; /* --------------------------------------------------------------------------- * uneven multi-hexagon-grid: 5x5 */ static int8_t GRID[24][2] = { { -1, -1 }, { 0, -1 }, { 1, -1 }, /* inside 8 points */ { -1, 0 }, { 1, 0 }, { -1, 1 }, { 0, 1 }, { 1, 1 }, { -2, -2 }, { -1, -2 }, { 0, 2 }, { 1, -2 }, { 2, -2 }, /* outside 16 points */ { -2, -1 }, { 2, -1 }, { -2, 0 }, { 2, 0 }, { -2, 1 }, { 2, 1 }, { -2, 2 }, { -1, 2 }, { 0, 2 }, { 1, 2 }, { 2, 2 } }; /* --------------------------------------------------------------------------- * ڷ */ static const int8_t Spiral[9][2] = { { 0, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 0 }, { 1, 0 }, { -1, 1 }, { 1, 1 } }; static const int8_t Spiral2[9][2] = { { 0, 0 }, { 0, -1 }, { -1, -1 }, /* 2 1 8 */ { -1, 0 }, { -1, 1 }, { 0, 1 }, /* 3 0 7 */ { 1, 1 }, { 1, 0 }, { 1, -1 } /* 4 5 6 */ }; /* --------------------------------------------------------------------------- * offsets for Two Point Search (TZ) */ static const int offsets[16][2] = { { -1, 0 }, { 0, -1 }, { -1, -1 }, { 1, -1 }, { -1, 0 }, { 1, 0 }, { -1, 1 }, { -1, -1 }, { 1, -1 }, { 1, 1 }, { -1, 0 }, { 0, 1 }, { -1, 1 }, { 1, 1 }, { 1, 0 }, { 0, 1 }, }; static const int i_org = FENC_STRIDE; /** * =========================================================================== * macros * =========================================================================== */ /* 
--------------------------------------------------------------------------- * early termination */ #define EARLY_TERMINATION(pred_sad) \ if (bcost < (pred_sad) * beta3) {\ goto umh_step_3;\ } else if (bcost < (pred_sad) * beta2) {\ goto umh_step_2;\ } /** * =========================================================================== * calculate cost for integer pixel motion search * =========================================================================== */ /* --------------------------------------------------------------------------- */ #define CAL_COST_IPEL(mx, my) \ g_funcs.pixf.sad[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my) /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL(mx, my) \ if (CHECK_MV_RANGE(mx, my)) {\ int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY3_IF_LT(bcost, cost, bmx, mx, bmy, my);\ } /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL_DIR(mx, my, d) \ if (CHECK_MV_RANGE(mx, my)) {\ int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY4_IF_LT(bcost, cost, bmx, mx, bmy, my, dir, d);\ } /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL_X3(m0x, m0y, m1x, m1y, m2x, m2y) \ {\ pel_t *pix_base = p_fref + omy * i_fref + omx;\ g_funcs.pixf.sad_x3[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ i_fref, costs);\ costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\ COPY3_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y));\ COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\ } /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL_X3_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2) \ {\ pel_t *pix_base = p_fref + omy * i_fref + omx;\ g_funcs.pixf.sad_x3[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ i_fref, costs);\ costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\ COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\ COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\ } /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ {\ if (CHECK_MV_RANGE_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y)) { \ pel_t *pix_base = p_fref + omy * i_fref + omx;\ g_funcs.pixf.sad_x4[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ pix_base + (m3y) * i_fref + (m3x),\ i_fref, costs);\ costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\ COPY3_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y));\ COPY3_IF_LT(bcost, costs[1], 
bmx, omx + (m1x), bmy, omy + (m1y));\ COPY3_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y));\ COPY3_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y));\ } else { \ ME_COST_IPEL(m0x, m0y); \ ME_COST_IPEL(m1x, m1y); \ ME_COST_IPEL(m2x, m2y); \ ME_COST_IPEL(m3x, m3y); \ } \ } /* --------------------------------------------------------------------------- */ #define ME_COST_IPEL_X4_DIR(m0x, m0y, d0, m1x, m1y, d1, m2x, m2y, d2, m3x, m3y, d3) \ {\ pel_t *pix_base = p_fref + omy * i_fref + omx;\ g_funcs.pixf.sad_x4[i_pixel](p_org,\ pix_base + (m0y) * i_fref + (m0x),\ pix_base + (m1y) * i_fref + (m1x),\ pix_base + (m2y) * i_fref + (m2x),\ pix_base + (m3y) * i_fref + (m3x), i_fref, costs);\ costs[0] += MV_COST_IPEL(omx + (m0x), omy + (m0y));\ costs[1] += MV_COST_IPEL(omx + (m1x), omy + (m1y));\ costs[2] += MV_COST_IPEL(omx + (m2x), omy + (m2y));\ costs[3] += MV_COST_IPEL(omx + (m3x), omy + (m3y));\ COPY4_IF_LT(bcost, costs[0], bmx, omx + (m0x), bmy, omy + (m0y), dir, d0);\ COPY4_IF_LT(bcost, costs[1], bmx, omx + (m1x), bmy, omy + (m1y), dir, d1);\ COPY4_IF_LT(bcost, costs[2], bmx, omx + (m2x), bmy, omy + (m2y), dir, d2);\ COPY4_IF_LT(bcost, costs[3], bmx, omx + (m3x), bmy, omy + (m3y), dir, d3);\ } /* --------------------------------------------------------------------------- * for TZ */ #define ME_COST_IPEL_DIR_DIST(mx, my, direction, dist) \ if (CHECK_MV_RANGE(mx, my)) {\ int cost = g_funcs.pixf.sad[i_pixel](p_org, i_org,\ p_fref + (my) * i_fref + (mx), i_fref) + MV_COST_IPEL(mx, my);\ COPY5_IF_LT(mv->bcost, cost, mv->bmx, mx, mv->bmy, my, mv->bdir, direction, mv->bdist, dist);\ } /* --------------------------------------------------------------------------- * for TZ */ #define ME_COST_IPEL_X4_DIR_DIST(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ {\ g_funcs.pixf.sad_x4[i_pixel](p_org,\ p_fref + (m0x) + (m0y) * i_fref,\ p_fref + (m1x) + (m1y) * i_fref,\ p_fref + (m2x) + (m2y) * i_fref,\ p_fref + (m3x) + (m3y) * i_fref,\ i_fref, costs);\ (costs)[0] += MV_COST_IPEL(m0x, m0y);\ (costs)[1] += MV_COST_IPEL(m1x, m1y);\ (costs)[2] += MV_COST_IPEL(m2x, m2y);\ (costs)[3] += MV_COST_IPEL(m3x, m3y);\ if (CHECK_MV_RANGE(m0x,m0y)) {\ COPY5_IF_LT(mv->bcost, costs[0], mv->bmx, m0x, mv->bmy, m0y, mv->bdir, p0, mv->bdist, d0);\ }\ if (CHECK_MV_RANGE(m1x,m1y)) {\ COPY5_IF_LT(mv->bcost, costs[1], mv->bmx, m1x, mv->bmy, m1y, mv->bdir, p1, mv->bdist, d1);\ }\ if (CHECK_MV_RANGE(m2x,m2y)) {\ COPY5_IF_LT(mv->bcost, costs[2], mv->bmx, m2x, mv->bmy, m2y, mv->bdir, p2, mv->bdist, d2);\ }\ if (CHECK_MV_RANGE(m3x,m3y)) {\ COPY5_IF_LT(mv->bcost, costs[3], mv->bmx, m3x, mv->bmy, m3y, mv->bdir, p3, mv->bdist, d3);\ }\ } /* --------------------------------------------------------------------------- * diamond: 1 * 1 0 1 * 1 */ #define DIA_ITER(mx, my) \ {\ omx = mx;\ omy = my;\ ME_COST_IPEL_X4(0,-1, -1,0, 1,0, 0,1);\ } /** * =========================================================================== * calculate cost for fractional pixel refine * =========================================================================== */ /* --------------------------------------------------------------------------- */ #define ME_COST_QPEL(mx, my) \ {\ pel_t *p_pred = p_filtered[(((my) & 3) << 2) + ((mx) & 3)] + i_offset\ + ((my) >> 2) * i_fref + ((mx) >> 2); \ cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, i_fref) + MV_COST_FPEL(mx, my);\ } /* --------------------------------------------------------------------------- */ #define ME_COST_QPEL_SYM \ {\ int mx_sym;\ int my_sym;\ \ cost = 
MAX_DISTORTION;\ if (h->i_type == SLICE_TYPE_B) {\ mx_sym = -scale_mv_skip ( mx, distance_bwd, distance_fwd);\ my_sym = -scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\ } else {\ mx_sym = scale_mv_skip ( mx, distance_bwd, distance_fwd);\ my_sym = scale_mv_skip_y(h, my, distance_bwd, distance_fwd);\ }\ \ if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_sym, my_sym)) {\ int xx1 = mx >> 2;\ int yy1 = my >> 2;\ int xx2 = mx_sym >> 2;\ int yy2 = my_sym >> 2;\ pel_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)]; \ pel_t *p_src2 = p_filtered2[((my_sym & 3) << 2) + (mx_sym & 3)]; \ pel_t *p_pred = buf_pixel_temp;\ \ if (p_src1 != NULL && p_src2 != NULL) { \ p_src1 += i_offset + yy1 * i_fref + xx1;\ p_src2 += i_offset + yy2 * i_fref + xx2;\ g_funcs.pixf.avg[i_pixel](p_pred, 64, p_src1, i_fref, p_src2, i_fref, 32); \ cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE)\ + MV_COST_FPEL(mx, my);\ } \ }\ } /* --------------------------------------------------------------------------- */ #define ME_COST_QPEL_BID \ if (CHECK_MV_RANGE(mx, my) && CHECK_MV_RANGE(mx_bid, my_bid)) {\ int xx1 = mx >> 2;\ int yy1 = my >> 2;\ pel_t *p_src1 = p_filtered1[((my & 3) << 2) + (mx & 3)] + i_offset + yy1 * i_fref + xx1;\ int distortion = g_funcs.pixf.fpel_cmp[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_src1, i_fref) >> 1;\ \ cost = distortion + MV_COST_FPEL(mx, my) + mv_bid_bit;\ } else {\ cost = MAX_DISTORTION;\ } /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * candMV1/4ȵֵµMVӦ2ʱµMVֵΧڣ򷵻1ʾµMVӦ * candMV1/4ֵΧڣµMVõʱMVֵΧ򷵻1ʾµMVӦ * 򣬷0ֵʾµMVӦü */ static int pmvr_adapt_mv(int *mx, int *my, int ctr_x, int ctr_y, int mv_x, int mv_y, int step_x, int step_y) { if (XAVS2_ABS(mv_x - ctr_x) > TH_PMVR || XAVS2_ABS(mv_y - ctr_y) > TH_PMVR) { *mx = mv_x + step_x * 2; *my = mv_y + step_y * 2; return (XAVS2_ABS(*mx - ctr_x) <= TH_PMVR && XAVS2_ABS(*my - ctr_y) <= TH_PMVR); } else { *mx = mv_x + step_x; *my = mv_y + step_y; return (XAVS2_ABS(*mx - ctr_x) > TH_PMVR || XAVS2_ABS(*my - ctr_y) > TH_PMVR); } } /* --------------------------------------------------------------------------- */ static int ALWAYS_INLINE mv_roundclip(int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_min[2], int mv_max[2], uint32_t pmv) { int cnt = 0; int i; for (i = 0; i < i_mvc; i++) { int mx = IPEL(mvc[i][0]); int my = IPEL(mvc[i][1]); uint32_t mv = MAKEDWORD(mx, my); if (!mv || mv == pmv) { continue; } dst[cnt][0] = (int16_t)XAVS2_CLIP3(mv_min[0], mv_max[0], mx); dst[cnt][1] = (int16_t)XAVS2_CLIP3(mv_min[1], mv_max[1], my); cnt++; } return cnt; } /* --------------------------------------------------------------------------- */ static int ALWAYS_INLINE mv_clip(int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_min[2], int mv_max[2], uint32_t pmv) { int cnt = 0; int i; for (i = 0; i < i_mvc; i++) { int mx = mvc[i][0]; int my = mvc[i][1]; uint32_t mv = M32(mvc[i]); if (!mv || mv == pmv) { continue; } dst[cnt][0] = (int16_t)XAVS2_CLIP3(mv_min[0], mv_max[0], mx); dst[cnt][1] = (int16_t)XAVS2_CLIP3(mv_min[1], mv_max[1], my); cnt++; } return cnt; } /* --------------------------------------------------------------------------- * sub pixel block motion search */ static dist_t me_subpel_refine(xavs2_t *h, xavs2_me_t *p_me) { #if !ENABLE_FRAME_SUBPEL_INTPL ALIGN32(pel_t p_pred[MAX_CU_SIZE * MAX_CU_SIZE]); #endif pel_t *p_org 
= p_me->p_fenc; pel_t **p_filtered = p_me->p_fref_1st->filtered; int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; int pmx = p_me->mvp.x; int pmy = p_me->mvp.y; int i_pixel = p_me->i_pixel; int i_offset = p_me->i_bias; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; int lambda = h->i_lambda_factor; const int search_pos2 = 9; const int search_pos4 = 9; const int search_step = h->use_fast_sub_me ? 2 : 1; const int8_t(*search_pattern)[2] = h->use_fast_sub_me ? Spiral2 : Spiral; dist_t bcost; int ctr_x = (pmx >> 1) << 1; int ctr_y = (pmy >> 1) << 1; int pos, cost; int mx, my, bmx, bmy; mv_t bmv; // convert search center to quarter-pel units bmx = p_me->bmv.x; bmy = p_me->bmv.y; bmv = p_me->bmv; if (h->param->enable_hadamard) { ME_COST_QPEL(bmx, bmy); bcost = cost; } else { bcost = p_me->bcost; } /* ------------------------------------------------------------- * half-pel refine */ // loop over search positions for (pos = 1; pos < search_pos2; pos += search_step) { mx = bmx + (search_pattern[pos][0] << 1); my = bmy + (search_pattern[pos][1] << 1); #if ENABLE_FRAME_SUBPEL_INTPL ME_COST_QPEL(mx, my); #else mv_t mvt; mvt.v = MAKEDWORD(mx, my); get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h); mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st); cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, my); #endif if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } bmx = bmv.x; bmy = bmv.y; /* ------------------------------------------------------------- * quarter-pel refine */ if (h->use_fractional_me > 1) { // loop over search positions for (pos = 1; pos < search_pos4; pos += search_step) { if (h->param->enable_pmvr) { if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, bmx, bmy, search_pattern[pos][0], search_pattern[pos][1])) { continue; } } else { mx = bmx + search_pattern[pos][0]; // quarter-pel units my = bmy + search_pattern[pos][1]; // quarter-pel units } // set motion vector cost #if ENABLE_FRAME_SUBPEL_INTPL ME_COST_QPEL(mx, my); #else mv_t mvt; mvt.v = MAKEDWORD(mx, my); get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, p_me->i_block_w, p_me->i_block_h); mc_luma(p_pred, MAX_CU_SIZE, mvt.x, mvt.y, p_me->i_block_w, p_me->i_block_h, p_me->p_fref_1st); cost = g_funcs.pixf.fpel_cmp[i_pixel](p_org, i_org, p_pred, MAX_CU_SIZE) + MV_COST_FPEL(mx, my); #endif if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } } // save the results p_me->bmv = bmv; p_me->bcost = bcost; p_me->mvcost[PDIR_FWD] = MV_COST_FPEL(bmv.x,bmv.y); return bcost; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * initialize the motion search */ int xavs2_me_get_buf_size(const xavs2_param_t *param) { int me_range = XAVS2_MAX(256, param->search_range); int subpel_num = 4 * (2 * me_range + 3); int max_mv_bits = 5 + 2 * (int)ceil(log(subpel_num + 1) / log(2) + 1e-10); int max_mvd = (1 << ((max_mv_bits >> 1))) - 1; int mem_size; /* buffer size for mvbits */ mem_size = (max_mvd * 2 + 1) * sizeof(uint16_t) + CACHE_LINE_SIZE; return mem_size; } /** * --------------------------------------------------------------------------- * Function : initialize the motion search module * Parameters : * [in ] : h - pointer to struct 
xavs2_t, the HiPE encoder * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xavs2_me_init(xavs2_t *h, uint8_t **mem_base) { uint8_t *mbase = *mem_base; int me_range = XAVS2_MAX(256, h->param->search_range); int subpel_num = 4 * (2 * me_range + 3); int max_mv_bits = 5 + 2 * (int)ceil(log(subpel_num + 1) / log(2) + 1e-10); int max_mvd = (1 << ((max_mv_bits >> 1))) - 1; int bits, i, imin, imax; /* set pointer of mvbits */ h->mvbits = (uint16_t *)mbase; h->mvbits += max_mvd; // reset the array offset mbase += (max_mvd * 2 + 1) * sizeof(uint16_t); ALIGN_POINTER(mbase); *mem_base = mbase; // init array of motion vector bits h->mvbits[0] = 1; for (bits = 3; bits <= max_mv_bits; bits += 2) { imax = 1 << (bits >> 1); imin = imax >> 1; for (i = imin; i < imax; i++) { h->mvbits[-i] = h->mvbits[i] = (uint16_t)bits; } } } /* --------------------------------------------------------------------------- * i_qp: QP of P/F frame * TODO: call this function before encoding P/F frame? */ void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp) { const int quant_coef[6] = { 13107, 11916, 10082, 9362, 8192, 7282 }; int gb_qp_per = i_qp / 6; int gb_qp_rem = i_qp % 6; int gb_q_bits = 15 + gb_qp_per; int gb_qp_const = (h->i_type == SLICE_TYPE_I) ? ((1 << gb_q_bits) / 3) : ((1 << gb_q_bits) / 6); int threshold_4x4 = ((1 << gb_q_bits) - gb_qp_const) / quant_coef[gb_qp_rem]; double quantize_step = threshold_4x4 / (4 * 5.61f); memset(bsize, 0, MAX_INTER_MODES * sizeof(double)); bsize[PRED_nRx2N] = (16 * 16) * quantize_step; bsize[PRED_nLx2N] = 4 * bsize[PRED_nRx2N]; bsize[PRED_2NxnD] = 4 * bsize[PRED_nRx2N]; bsize[PRED_2NxnU] = 4 * bsize[PRED_2NxnD]; bsize[PRED_Nx2N ] = 4 * bsize[PRED_2NxnU]; bsize[PRED_2NxN ] = 4 * bsize[PRED_2NxnU]; bsize[PRED_2Nx2N] = 4 * bsize[PRED_2NxN ]; } /* --------------------------------------------------------------------------- */ static void tz_pattern_search(xavs2_t* h, xavs2_me_t *p_me, pel_t* p_org, pel_t* p_fref, mv_info* mv, int mv_x_min, int mv_y_min, int mv_x_max, int mv_y_max, int i_pixel, int i_fref, int earlyExitIters, int merange) { ALIGN16(int costs[16]); const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; int lambda = h->i_lambda_factor; int rounds = 0; int dist = 1; int omx = mv->bmx; int omy = mv->bmx; dist_t bcost = mv->bcost; int top = omy - dist; int bottom = omy + dist; int left = omx - dist; int right = omx + dist; int top2, bottom2, left2, right2; int posYT, posYB, posXL, posXR; int idx; if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { ME_COST_IPEL_X4_DIR_DIST(omx, top, 2, dist, /* direction */ left, omy, 4, dist, /* 2 */ right, omy, 5, dist, /* 4 * 5 */ omx, bottom, 7, dist); /* 7 */ } else { if (top >= mv_y_min) { // check top ME_COST_IPEL_DIR_DIST(omx, top, 2, dist); } if (left >= mv_x_min) { // check middle left ME_COST_IPEL_DIR_DIST(left, omy, 4, dist); } if (right <= mv_x_max) { // check middle right ME_COST_IPEL_DIR_DIST(right, omy, 5, dist); } if (bottom <= mv_y_max) { // check bottom ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist); } } if (mv->bcost < bcost) { rounds = 0; } else if (++rounds >= earlyExitIters) { return; } for (dist = 2; dist <= 8; dist <<= 1) { /* 2 points 2, 4, 5, 7 are dist * 1 3 points 1, 3, 6, 8 are dist/2 * 4 * 5 * 6 8 * 7 */ omx = mv->bmx; 
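/* re-centre the pattern on the current best MV for this radius: the four
 * cardinal points (top/bottom/left/right) are tested at `dist` and the four
 * diagonal points at `dist >> 1`, as sketched in the diagram above; every
 * probe updates mv->bcost / bmx / bmy / bdir / bdist through the
 * ME_COST_IPEL_*_DIST macros, and `rounds` counts radii without improvement
 * for the early exit. */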
omy = mv->bmx; bcost = mv->bcost; top = omy - dist; bottom = omy + dist; left = omx - dist; right = omx + dist; top2 = omy - (dist >> 1); bottom2 = omy + (dist >> 1); left2 = omx - (dist >> 1); right2 = omx + (dist >> 1); // check border if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { ME_COST_IPEL_X4_DIR_DIST(omx, top, 2, dist, left2, top2, 1, dist >> 1, right2, top2, 3, dist >> 1, left, omy, 4, dist); ME_COST_IPEL_X4_DIR_DIST(right, omy, 5, dist, left2, bottom2, 6, dist >> 1, right2, bottom2, 8, dist >> 1, omx, bottom, 7, dist); } else { if (top >= mv_y_min) { // check top ME_COST_IPEL_DIR_DIST(omx, top, 2, dist); } if (top2 >= mv_y_min) { // check half top if (left2 >= mv_x_min) { // check half left ME_COST_IPEL_DIR_DIST(left2, top2, 1, (dist >> 1)); } if (right2 <= mv_x_max) { // check half right ME_COST_IPEL_DIR_DIST(right2, top2, 3, (dist >> 1)); } } if (left >= mv_x_min) { // check left ME_COST_IPEL_DIR_DIST(left, omy, 4, dist); } if (right <= mv_x_max) { // check right ME_COST_IPEL_DIR_DIST(right, omy, 5, dist); } if (bottom2 <= mv_y_max) { // check half bottom if (left2 >= mv_x_min) { // check half left ME_COST_IPEL_DIR_DIST(left2, bottom2, 6, (dist >> 1)); } if (right2 <= mv_x_max) { // check half right ME_COST_IPEL_DIR_DIST(right2, bottom2, 8, (dist >> 1)); } } if (bottom <= mv_y_max) { // check bottom ME_COST_IPEL_DIR_DIST(omx, bottom, 7, dist); } } if (mv->bcost < bcost) { rounds = 0; } else if (++rounds >= earlyExitIters) { return; } } for (dist = 16; dist <= merange; dist <<= 1) { omx = mv->bmx; omy = mv->bmx; bcost = mv->bcost; top = omy - dist; bottom = omy + dist; left = omx - dist; right = omx + dist; if (top >= mv_y_min && left >= mv_x_min && right <= mv_x_max && bottom <= mv_y_max) { // check border /* index: 0 * 3 * 2 * 1 * 0 3 2 1 * 1 2 3 0 * 1 * 2 * 3 * 0 */ ME_COST_IPEL_X4_DIR_DIST(omx, top, 0, dist, left, omy, 0, dist, right, omy, 0, dist, omx, bottom, 0, dist); for (idx = 1; idx < 4; idx++) { posYT = top + ((dist >> 2) * idx); posYB = bottom - ((dist >> 2) * idx); posXL = omx - ((dist >> 2) * idx); posXR = omx + ((dist >> 2) * idx); ME_COST_IPEL_X4_DIR_DIST(posXL, posYT, 0, dist, posXR, posYT, 0, dist, posXL, posYB, 0, dist, posXR, posYB, 0, dist); } } else { // check border for each mv if (top >= mv_y_min) { // check top ME_COST_IPEL_DIR_DIST(omx, top, 0, dist); } if (left >= mv_x_min) { // check left ME_COST_IPEL_DIR_DIST(left, omy, 0, dist); } if (right <= mv_x_max) { // check right ME_COST_IPEL_DIR_DIST(right, omy, 0, dist); } if (bottom <= mv_y_max) { // check bottom ME_COST_IPEL_DIR_DIST(omx, bottom, 0, dist); } for (idx = 1; idx < 4; idx++) { posYT = top + ((dist >> 2) * idx); posYB = bottom - ((dist >> 2) * idx); posXL = omx - ((dist >> 2) * idx); posXR = omx + ((dist >> 2) * idx); if (posYT >= mv_y_min) { // check top if (posXL >= mv_x_min) { // check left ME_COST_IPEL_DIR_DIST(posXL, posYT, 0, dist); } if (posXR <= mv_x_max) { // check right ME_COST_IPEL_DIR_DIST(posXR, posYT, 0, dist); } } if (posYB <= mv_y_max) { // check bottom if (posXL >= mv_x_min) { // check left ME_COST_IPEL_DIR_DIST(posXL, posYB, 0, dist); } if (posXR <= mv_x_max) { // check right ME_COST_IPEL_DIR_DIST(posXR, posYB, 0, dist); } } } } if (mv->bcost < bcost) { rounds = 0; } else if (++rounds >= earlyExitIters) { return; } } } // int g_me_time[4] = { 0 }; /* --------------------------------------------------------------------------- * return minimum motion cost after search */ dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, 
int16_t(*mvc)[2], int i_mvc) { /* special version of pack to allow shortcuts in CHECK_MV_RANGE */ ALIGNED_ARRAY_16(int, costs,[8]); double beta2 = p_me->beta2 + 1; double beta3 = p_me->beta3 + 1; pel_t *p_org = p_me->p_fenc; pel_t *p_fref = p_me->p_fref_1st->planes[IMG_Y] + p_me->i_bias; int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; int i_pixel = p_me->i_pixel; int mv_x_min = p_me->mv_min_fpel[0]; int mv_y_min = p_me->mv_min_fpel[1]; int mv_x_max = p_me->mv_max_fpel[0]; int mv_y_max = p_me->mv_max_fpel[1]; int me_range = h->param->search_range; int lambda = h->i_lambda_factor; // factor for determining Lagrangian's motion cost const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; uint32_t pmv; dist_t bcost = MAX_DISTORTION; int bmx = 0, bmy = 0; int omx, omy; int i, j, dir, idx; const int umh_1_3_step = h->UMH_big_hex_level == 2 ? 16 : 8; const int8_t(*search_patern)[2] = h->UMH_big_hex_level == 2 ? HEX4 : FAST_HEX4; // g_me_time[0]++; /* ------------------------------------------------------------- * try MVP and some key searching points */ pmv = MAKEDWORD(mvc[0][0], mvc[0][1]); /* mvc[0][] is the MVP */ for (i = 0; i < i_mvc; i++) { int mx = mvc[i][0]; int my = mvc[i][1]; ME_COST_IPEL(mx, my); } if (bcost == MAX_DISTORTION) { goto _me_error; /* me failed */ } /* ------------------------------------------------------------- * search using different method */ switch (h->param->me_method) { case XAVS2_ME_TZ: /* TZ */ { const int RasterDistance = 16; const int MaxIters = 32; const int EarlyExitIters = 3; dist_t bdist; int mv1_x, mv1_y, mv2_x, mv2_y; mv_info mvinfo; omx = bmx; omy = bmy; ME_COST_IPEL_X3(-2, 0, -1, 2, 1, 2); ME_COST_IPEL_X3( 2, 0, 1, -2, -1, -2); if (CHECK_MV_RANGE(bmx, bmy)) { DIA_ITER(bmx, bmy); } mvinfo.bcost = bcost; mvinfo.bdist = 0; mvinfo.bmx = bmx; mvinfo.bmy = bmy; mvinfo.bdir = 0; tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, EarlyExitIters, me_range); bcost = mvinfo.bcost; bdist = mvinfo.bdist; bmx = mvinfo.bmx; bmy = mvinfo.bmy; dir = mvinfo.bdir; if (bdist == 1) { if (!dir) { break; } /* if best distance was only 1, check two missing points. 
* for a given direction 1 to 8, check nearest two outer X pixels*/ mv1_x = bmx + offsets[(dir - 1) * 2 ][0]; /* X X */ mv1_y = bmy + offsets[(dir - 1) * 2 ][1]; /* X 1 2 3 X */ mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* 4 * 5 */ mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /* X 6 7 8 X */ if (CHECK_MV_RANGE(mv1_x, mv1_y)) { /* X X */ ME_COST_IPEL(mv1_x, mv1_y); } if (CHECK_MV_RANGE(mv2_x, mv2_y)) { ME_COST_IPEL(mv2_x, mv2_y); } /* if no new point is found, stop */ if (bcost == mvinfo.bcost) { break; /* the bcost is not changed */ } } /* raster search refinement if original search distance was too big */ if (bdist > RasterDistance) { const int iRasterDist = RasterDistance >> 1; const int iRasterDist2 = RasterDistance >> 2; int rmv_y_min = XAVS2_MAX(mv_y_min, bmy - RasterDistance + 2); int rmv_y_max = XAVS2_MIN(mv_y_max, bmy + RasterDistance - 2); int rmv_x_min = XAVS2_MAX(mv_x_min, bmx - RasterDistance + 2); int rmv_x_max = XAVS2_MIN(mv_x_max, bmx + RasterDistance - 2); for (j = rmv_y_min; j < rmv_y_max; j += iRasterDist) { for (i = rmv_x_min; i < rmv_x_max; i += iRasterDist) { ME_COST_IPEL_X4(i, j, i, j + iRasterDist2, i + iRasterDist2, j, i + iRasterDist2, j + iRasterDist2); } } } while (bdist > 0) { // center a new search around current best mvinfo.bcost = bcost; mvinfo.bdist = 0; mvinfo.bmx = bmx; mvinfo.bmy = bmy; mvinfo.bdir = 0; tz_pattern_search(h, p_me, p_org, p_fref, &mvinfo, mv_x_min, mv_y_min, mv_x_max, mv_y_max, i_pixel, i_fref, MaxIters, me_range); bcost = mvinfo.bcost; bdist = mvinfo.bdist; bmx = mvinfo.bmx; bmy = mvinfo.bmy; dir = mvinfo.bdir; if (bdist == 1) { /* for a given direction 1 to 8, check nearest 2 outer X pixels */ if (dir) { /* X X */ mv1_x = bmx + offsets[(dir - 1) * 2 ][0]; /* X 1 2 3 X */ mv1_y = bmy + offsets[(dir - 1) * 2 ][1]; /* 4 * 5 */ mv2_x = bmx + offsets[(dir - 1) * 2 + 1][0]; /* X 6 7 8 X */ mv2_y = bmy + offsets[(dir - 1) * 2 + 1][1]; /* X X */ if (CHECK_MV_RANGE(mv1_x, mv1_y)) { ME_COST_IPEL(mv1_x, mv1_y); } if (CHECK_MV_RANGE(mv2_x, mv2_y)) { ME_COST_IPEL(mv2_x, mv2_y); } } break; } } /* equivalent to the above, but eliminates duplicate candidates */ goto umh_step_2; } case XAVS2_ME_UMH: /* UMH */ /* http://www.cnblogs.com/TaigaCon/archive/2014/06/16/3788984.html * 0. ʼ */ DIA_ITER(mvc[0][0], mvc[0][1]); if (pmv && (bmx != mvc[0][0] || bmy != mvc[0][1])) { DIA_ITER(bmx, bmy); pmv = MAKEDWORD(bmx, bmy); } // select different step according to the different cost from upper layer if (p_me->mvp1.v != 0) { int mx = IPEL(p_me->mvp1.x); int my = IPEL(p_me->mvp1.y); ME_COST_IPEL(mx, my); } EARLY_TERMINATION(p_me->pred_sad_uplayer); // g_me_time[1]++; // prediction using mv of last ref_idx motion vector if (p_me->i_ref_idx > 0) { ME_COST_IPEL(IPEL(p_me->mvp2.x), IPEL(p_me->mvp2.y)); } if (p_me->mvp3.v != 0) { ME_COST_IPEL(IPEL(p_me->mvp3.x), IPEL(p_me->mvp3.y)); } /* ǰMV MVPΧһС */ if (pmv != MAKEDWORD(bmx, bmy)) { DIA_ITER(bmx, bmy); } // early termination algorithm EARLY_TERMINATION(p_me->pred_sad); // umh_step_1: /* UMH 1. Unsymmetrical-cross search ǶԳʮ */ // g_me_time[2]++; omx = bmx; omy = bmy; for (i = 1; i <= me_range; i += 2) { ME_COST_IPEL(omx + i, omy); ME_COST_IPEL(omx - i, omy); } for (j = 1; j <= me_range / 2; j += 2) { ME_COST_IPEL(omx, omy + j); ME_COST_IPEL(omx, omy - j); } // early termination algorithm EARLY_TERMINATION(p_me->pred_sad); /* UMH 2. 
Spiral search */ omx = bmx; omy = bmy; for (i = 0; i < 24; i++) { ME_COST_IPEL(omx + GRID[i][0], omy + GRID[i][1]); } // early termination algorithm EARLY_TERMINATION(p_me->pred_sad); // big hexagon if (h->UMH_big_hex_level) { for (j = 1; j <= me_range / 4; j++) { omx = bmx; omy = bmy; for (i = 0; i < umh_1_3_step; i++) { ME_COST_IPEL(omx + search_patern[i][0] * j, omy + search_patern[i][1] * j); } if (bmx != omx || bmy != omy) { EARLY_TERMINATION(p_me->pred_sad); } } } /* !!! NO break statement here */ case XAVS2_ME_HEX: /* hexagon search */ umh_step_2 : /* UMH 3. Uneven Multi-Hexagon-grid Search ģ */ // g_me_time[3]++; dir = 0; /* 6 5 */ omx = bmx; /* */ omy = bmy; /* 1 * 4 */ ME_COST_IPEL_X3_DIR(-1,-2,6, 1,-2,5, -2,0,1); /* */ ME_COST_IPEL_X3_DIR( 2, 0,4, -1, 2,2, 1,2,3); /* 2 3 */ if (dir) { const int8_t (*hex)[2]; /* UMH 4. Extended Hexagon-based Search ģ巴 */ idx = dir - 1; /* start array index */ /* half hexagon, not overlapping the previous iteration */ for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) { dir = 0; omx = bmx; omy = bmy; hex = &HEX2[idx]; ME_COST_IPEL_X3_DIR(hex[0][0],hex[0][1],1, hex[1][0],hex[1][1],2, hex[2][0],hex[2][1],3); if (!dir) { break; /* early terminate */ } idx = M1MOD6[dir + idx - 1]; /* next start array index */ } } /* !!! NO break statement here */ case XAVS2_ME_DIA: /* diamond search */ umh_step_3: /* UMH 5. the third step with a small search pattern Сģ巴 */ dir = 0; if (CHECK_MV_RANGE(bmx, bmy)) { omx = bmx; /* 4 */ omy = bmy; /* 1 * 3 */ ME_COST_IPEL_X4_DIR(0,-1,4, -1,0,1, 1,0,3, 0,1,2); /* 2 */ } if (dir) { const int8_t (*dia)[2]; idx = dir - 1; /* start array index */ /* half diamond, not overlapping the previous iteration */ for (i = 0; i < me_range && CHECK_MV_RANGE(bmx, bmy); i++) { dir = 0; omx = bmx; omy = bmy; dia = &DIA1[idx]; ME_COST_IPEL_X3_DIR(dia[0][0],dia[0][1],1, dia[1][0],dia[1][1],2, dia[2][0],dia[2][1],3); if (!dir) { break; /* early terminate */ } idx = M1MOD4[dir + idx - 1]; /* next start array index */ } } break; default: /* XAVS2_ME_FS: full search */ omx = bmx; omy = bmy; for (j = -me_range; j < me_range; j++) { for (i = -me_range; i < me_range; i++) { ME_COST_IPEL(omx + i, omy + j); } } break; } /* ------------------------------------------------------------- * store the results of fullpel search */ p_me->bmv.v = MAKEDWORD(FPEL(bmx), FPEL(bmy)); p_me->bmv2.v = MAKEDWORD(bmx, bmy); p_me->bcost = bcost; p_me->bcost2 = bcost; p_me->mvcost[PDIR_FWD] = MV_COST_IPEL(bmx, bmy); /* ------------------------------------------------------------- * sub-pel refine */ if (h->use_fractional_me) { bcost = me_subpel_refine(h, p_me); } _me_error: return bcost; } /* --------------------------------------------------------------------------- * find motion vector for forward dual hypothesis prediction (sub-pel search) * return minimum motion cost after search */ dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv) { const int search_pos2 = 5; // search positions for half-pel search (default: 9) const int search_pos4 = 5; // search positions for quarter-pel search (default: 9) pel_t **p_filtered1 = p_me->p_fref_1st->filtered; pel_t **p_filtered2 = p_me->p_fref_2nd->filtered; pel_t *p_org = p_me->p_fenc; int distance_fwd = p_me->i_distance_1st; int distance_bwd = p_me->i_distance_2nd; int i_pixel = p_me->i_pixel; int i_offset = p_me->i_bias; int ctr_x = (p_me->mvp1.x >> 1) << 1; int ctr_y = (p_me->mvp1.y >> 1) << 1; int mv_x_min = p_me->mv_min[0]; int mv_y_min = p_me->mv_min[1]; int mv_x_max = 
p_me->mv_max[0]; int mv_y_max = p_me->mv_max[1]; int lambda = h->i_lambda_factor; int min_pos2 = (h->param->enable_hadamard ? 0 : 1); int max_pos2 = (h->param->enable_hadamard ? XAVS2_MAX(1, search_pos2) : search_pos2); const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp.y; mv_t bmv = *mv; // best mv dist_t bcost = MAX_DISTORTION; dist_t cost; int pos; int mx, my; int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; if (!h->use_fractional_me) { mx = mv->x; my = mv->y; ME_COST_QPEL_SYM; bcost = cost; bmv.v = MAKEDWORD(mx, my); return bcost; } // loop over search positions for (pos = min_pos2; pos < max_pos2; pos++) { mx = mv->x + (Spiral[pos][0] << 1); // quarter-pel units my = mv->y + (Spiral[pos][1] << 1); // quarter-pel units ME_COST_QPEL_SYM; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } mv->v = bmv.v; /* ------------------------------------------------------------- * quarter-pel refine */ // loop over search positions if (h->use_fractional_me >= 2) { for (pos = 1; pos < search_pos4; pos++) { if (h->param->enable_pmvr) { if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, mv->x, mv->y, Spiral[pos][0], Spiral[pos][1])) { continue; } } else { mx = mv->x + Spiral[pos][0]; // quarter-pel units my = mv->y + Spiral[pos][1]; // quarter-pel units } ME_COST_QPEL_SYM; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } } mv->v = bmv.v; p_me->mvcost[PDIR_SYM] = MV_COST_FPEL(bmv.x, bmv.y); // return minimum motion cost return bcost; } /* --------------------------------------------------------------------------- * return minimum motion cost after search (sub-pel search) */ dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t *bwd_mv, cu_parallel_t *p_enc) { pel_t **p_filtered1 = p_me->p_fref_1st->filtered; pel_t **p_filtered2 = p_me->p_fref_2nd->filtered; pel_t *p_org = p_me->p_fenc; const int search_pos2 = 9; // search positions for half-pel search (default: 9) const int search_pos4 = 9; // search positions for quarter-pel search (default: 9) int i_pixel = p_me->i_pixel; int i_offset = p_me->i_bias; int ctr_x = (p_me->mvp1.x >> 1) << 1; int ctr_y = (p_me->mvp1.y >> 1) << 1; int mv_x_min = p_me->mv_min[0]; int mv_y_min = p_me->mv_min[1]; int mv_x_max = p_me->mv_max[0]; int mv_y_max = p_me->mv_max[1]; int lambda = h->i_lambda_factor; int min_pos2 = (h->param->enable_hadamard ? 0 : 1); int max_pos2 = (h->param->enable_hadamard ? 
XAVS2_MAX(1, search_pos2) : search_pos2); int block_w = p_me->i_block_w; int xx2; int yy2; int mv_bid_bit; const uint32_t mv_min = pack16to32_mask2(-mv_x_min, -mv_y_min); const uint32_t mv_max = pack16to32_mask2(mv_x_max, mv_y_max) | 0x8000; const uint16_t *p_cost_mvx = h->mvbits - p_me->mvp1.x; const uint16_t *p_cost_mvy = h->mvbits - p_me->mvp1.y; const uint16_t *p_cost_bix = h->mvbits - p_me->mvp2.x; const uint16_t *p_cost_biy = h->mvbits - p_me->mvp2.y; mv_t bmv = *fwd_mv; // best mv dist_t bcost = MAX_DISTORTION; dist_t cost; int mx, my, mx_bid, my_bid; int pos; int i_fref = p_me->p_fref_1st->i_stride[IMG_Y]; coeff_t *cur_blk = p_enc->coeff_blk; mx_bid = bwd_mv->x; my_bid = bwd_mv->y; //ѱֵԤֵļ㹫ʽΪ2ֵ-Ԥֵ xx2 = mx_bid >> 2; yy2 = my_bid >> 2; mv_bid_bit = MV_COST_FPEL_BID(mx_bid, my_bid); if (CHECK_MV_RANGE(mx_bid, my_bid)) { pel_t *p_src2 = p_filtered2[((my_bid & 3) << 2) + (mx_bid & 3)]; if (p_src2 != NULL) { p_src2 += i_offset + yy2 * i_fref + xx2; g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, p_src2, FENC_STRIDE, i_fref);//M-A } else { ALIGN32(pel_t tmp_pred[MAX_CU_SIZE * MAX_CU_SIZE]); mv_t mvt; mvt.x = (int16_t)mx_bid; mvt.y = (int16_t)my_bid; get_mv_for_mc(h, &mvt, p_me->i_pix_x, p_me->i_pix_y, block_w, p_me->i_block_h); mc_luma(tmp_pred, MAX_CU_SIZE, mvt.x, mvt.y, block_w, p_me->i_block_h, p_me->p_fref_2nd); g_funcs.pixf.sub_ps[i_pixel](cur_blk, block_w, p_org, tmp_pred, FENC_STRIDE, MAX_CU_SIZE);//M-A } g_funcs.pixf.add_ps[i_pixel](buf_pixel_temp, MAX_CU_SIZE, p_org, cur_blk, FENC_STRIDE, block_w);//M-A+M } if (!h->use_fractional_me) { mx = fwd_mv->x; my = fwd_mv->y; ME_COST_QPEL_BID; bcost = cost; bmv.v = MAKEDWORD(mx, my); return bcost; } // loop over search positions for (pos = min_pos2; pos < max_pos2; pos++) { mx = fwd_mv->x + (Spiral[pos][0] << 1); // quarter-pel units my = fwd_mv->y + (Spiral[pos][1] << 1); // quarter-pel units ME_COST_QPEL_BID; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } fwd_mv->v = bmv.v; /* ------------------------------------------------------------- * quarter-pel refine */ // loop over search positions if (h->use_fractional_me >= 2) { for (pos = 1; pos < search_pos4; pos++) { if (h->param->enable_pmvr) { if (pmvr_adapt_mv(&mx, &my, ctr_x, ctr_y, fwd_mv->x, fwd_mv->y, Spiral[pos][0], Spiral[pos][1])) { continue; } } else { mx = fwd_mv->x + Spiral[pos][0]; // quarter-pel units my = fwd_mv->y + Spiral[pos][1]; // quarter-pel units } ME_COST_QPEL_BID; if (cost < bcost) { bcost = cost; bmv.v = MAKEDWORD(mx, my); } } } fwd_mv->v = bmv.v; p_me->mvcost[PDIR_BID] = MV_COST_FPEL(bmv.x, bmv.y) + MV_COST_FPEL_BID(mx_bid, my_bid); // return minimum motion cost return bcost; } xavs2-1.3/source/encoder/me.h000066400000000000000000000113411340660520300161100ustar00rootroot00000000000000/* * me.h * * Description of this file: * ME functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_ME_H #define XAVS2_ME_H /** * =========================================================================== * macros * =========================================================================== */ #define pack16to32_mask(x,y) (((x) << 16)|((y) & 0xFFFF)) #define pack16to32_mask2(mx,my) (((mx) << 16) | ((my) & 0x7FFF)) #define CHECK_MV_RANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) #define CHECK_MV_RANGE_X4(x0,y0,x1,y1,x2,y2,x3,y3) (!(( \ (pack16to32_mask2(x0, y0) + mv_min) | (mv_max - pack16to32_mask2(x0, y0)) | \ (pack16to32_mask2(x1, y1) + mv_min) | (mv_max - pack16to32_mask2(x1, y1)) | \ (pack16to32_mask2(x2, y2) + mv_min) | (mv_max - pack16to32_mask2(x2, y2)) | \ (pack16to32_mask2(x3, y3) + mv_min) | (mv_max - pack16to32_mask2(x3, y3)) \ ) & 0x80004000)) /* --------------------------------------------------------------------------- * conversion */ #define IPEL(mv) (((mv) + 2) >> 2) /* convert fractional pixel MV to integer pixel with rounding */ #define FPEL(mv) ((mv) << 2) /* convert integer pixel MV to fractional pixel */ /* --------------------------------------------------------------------------- */ #define COPY1_IF_LT(x, y) \ if ((y) < (x)) {\ (x) = (y);\ } #define COPY2_IF_LT(x, y, a, b) \ if ((y) < (x)) {\ (x) = (y);\ (a) = (b);\ } #define COPY3_IF_LT(x, y, a, b, c, d) \ if ((y) < (x)) {\ (x) = (y);\ (a) = (b);\ (c) = (d);\ } #define COPY4_IF_LT(x, y, a, b, c, d, e, f) \ if ((y) < (x)) {\ (x) = (y);\ (a) = (b);\ (c) = (d);\ (e) = (f);\ } #define COPY5_IF_LT(x, y, a, b, c, d, e, f, g, h) \ if ((y) < (x)) {\ (x) = (y);\ (a) = (b);\ (c) = (d);\ (e) = (f);\ (g) = (h);\ } /* --------------------------------------------------------------------------- * MV cost */ #define MV_COST_IPEL(mx,my) (WEIGHTED_COST(lambda, p_cost_mvx[(mx) << 2] + p_cost_mvy[(my) << 2])) #define MV_COST_FPEL(mx,my) (WEIGHTED_COST(lambda, p_cost_mvx[mx] + p_cost_mvy[my])) #define MV_COST_FPEL_BID(mx,my) (WEIGHTED_COST(lambda, p_cost_bix[mx] + p_cost_biy[my])) /** * =========================================================================== * function declares * =========================================================================== */ #define xavs2_me_get_buf_size FPFX(me_get_buf_size) int xavs2_me_get_buf_size(const xavs2_param_t *param); #define xavs2_me_init FPFX(me_init) void xavs2_me_init(xavs2_t *h, uint8_t **mem_base); #define xavs2_me_init_umh_threshold FPFX(me_init_umh_threshold) void xavs2_me_init_umh_threshold(xavs2_t *h, double *bsize, int i_qp); #define xavs2_me_search FPFX(me_search) dist_t xavs2_me_search(xavs2_t *h, xavs2_me_t *p_me, int16_t(*mvc)[2], int i_mvc); #define xavs2_me_search_sym FPFX(me_search_sym) dist_t xavs2_me_search_sym(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *mv); #define xavs2_me_search_bid FPFX(me_search_bid) dist_t xavs2_me_search_bid(xavs2_t *h, xavs2_me_t *p_me, pel_t *buf_pixel_temp, mv_t *fwd_mv, mv_t 
*bwd_mv, cu_parallel_t *p_enc); #endif // XAVS2_ME_H xavs2-1.3/source/encoder/parameters.c000066400000000000000000001076271340660520300176620ustar00rootroot00000000000000/* * parameters.c * * Description of this file: * Parameters definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ /* --------------------------------------------------------------------------- * disable warning C4996: functions or variables may be unsafe. */ #if defined(_MSC_VER) #define _CRT_SECURE_NO_WARNINGS #endif /* --------------------------------------------------------------------------- * include files */ #include #include #include #include #include #if defined(_MSC_VER) #include #include #endif #include "xavs2.h" #include "common.h" #include "rps.h" #include "encoder/presets.h" #include "encoder/encoder.h" #include "encoder/wrapper.h" /** * =========================================================================== * defines and global variables * =========================================================================== */ #define MAP_TAB_SIZE 512 /* maximal size of mapping table */ #define MAX_ITEMS 1024 /* maximal number of items to parse */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define REMOVE_WARNING \ __pragma(warning(push))\ __pragma(warning(disable:4127)) #define RESTORE_WARNING \ __pragma(warning(pop)) #else #define REMOVE_WARNING #define RESTORE_WARNING #endif #define xavs2_param_match(x,y) (!strcasecmp(x,y)) /* --------------------------------------------------------------------------- * map type */ enum MAP_TYPE { MAP_STR = 1, /* char * */ MAP_NUM = 2, /* int data */ MAP_FLAG = 3, /* flag: 0/1 value */ MAP_FLOAT = 4, /* float data */ MAP_END = 9 }; /* --------------------------------------------------------------------------- * mapping for config item */ typedef struct mapping_t { char name[32]; /* name for configuration */ void *addr; /* memory address to store parameter value */ int type; /* type, string or number */ const char *s_instruction; /* instruction */ } mapping_t; /* mapping table of supported parameters */ typedef struct xavs2_param_map_t { xavs2_param_t *param; mapping_t map_tab[MAP_TAB_SIZE]; } xavs2_param_map_t; static xavs2_param_map_t g_param_map = { NULL }; /** * --------------------------------------------------------------------------- * Function : set default values for encoding parameters * Parameters : * 
[in ] : map_tab - mapping table * : p - pointer to struct xavs2_param_t * [out] : none * Return : none * --------------------------------------------------------------------------- */ static void mapping_default(xavs2_param_map_t *p_map_tab, xavs2_param_t *p) { mapping_t * map_tab = p_map_tab->map_tab; p_map_tab->param = p; /* token - token name * var - store address * t - type * instr - instruction of parameter * */ #define MAP(token, var, t, instr)\ REMOVE_WARNING\ if (strlen(token) > 0) {\ strcpy(map_tab[item_idx].name, (token));\ map_tab[item_idx].addr = (var);\ map_tab[item_idx].type = (t);\ map_tab[item_idx].s_instruction = (instr);\ item_idx++;\ } else {\ map_tab[item_idx].addr = NULL;\ map_tab[item_idx].type = MAP_END;\ map_tab[item_idx].s_instruction = "";\ }\ RESTORE_WARNING int item_idx = 0; /* input */ MAP("Width", &p->org_width, MAP_NUM, "Image width in pixels"); MAP("SourceWidth", &p->org_width, MAP_NUM, " - Same as `Width`"); MAP("Height", &p->org_height, MAP_NUM, "Image height in pixels"); MAP("SourceHeight", &p->org_height, MAP_NUM, " - Same as `Height`"); MAP("Input", &p->psz_in_file, MAP_STR, "Input sequence, YUV 4:2:0"); MAP("InputFile", &p->psz_in_file, MAP_STR, " - Same as `Input`"); MAP("InputHeaderLength", &p->infile_header, MAP_NUM, "If the inputfile has a header, state it's length in byte here "); MAP("FrameRate", &p->frame_rate_code, MAP_NUM, "FramerateCode, 1: 24000/1001,2: 24,3: 25(default), 4: 30000/1001,5: 30,6: 50,7: 60000/1001,8: 60"); MAP("ChromaFormat", &p->chroma_format, MAP_NUM, "YUV format, 0=4:0:0, 1=4:2:0(default), 2=4:2:2"); MAP("InputSampleBitDepth", &p->input_sample_bit_depth, MAP_NUM, "Sample Bitdepth of input file"); MAP("Frames", &p->num_frames, MAP_NUM, "Number of frames to be coded"); MAP("FramesToBeEncoded", &p->num_frames, MAP_NUM, " - Same as `Frames`"); /* output */ MAP("output", &p->psz_bs_file, MAP_STR, "Output bistream file path"); MAP("OutputFile", &p->psz_bs_file, MAP_STR, " - Same as `output`"); MAP("Recon", &p->psz_dump_yuv, MAP_STR, "Output reconstruction YUV file path"); MAP("ReconFile", &p->psz_dump_yuv, MAP_STR, " - Same as `Recon`"); /* encoder configurations */ MAP("MaxSizeInBit", &p->lcu_bit_level, MAP_NUM, "Maximum Coding Unit (CU) Size (4, 5, 6)"); MAP("MinSizeInBit", &p->scu_bit_level, MAP_NUM, "Minimum Coding Unit (CU) Size (3, 4, 5, 6)"); MAP("ProfileID", &p->profile_id, MAP_NUM, "Profile ID (18: MAIN PICTURE profile, 32: MAIN profile, 34: MAIN10 profile)"); MAP("LevelID", &p->level_id, MAP_NUM, "Level ID (16: 2.0; 32: 4.0; 34: 4.2; 64: 6.0; 66: 6.2)"); MAP("SampleBitDepth", &p->sample_bit_depth, MAP_NUM, "Encoding bit-depth"); MAP("IntraPeriodMax", &p->intra_period_max, MAP_NUM, "maximum intra-period, one I-frame mush appear in any NumMax of frames"); MAP("IntraPeriodMin", &p->intra_period_min, MAP_NUM, "minimum intra-period, only one I-frame can appear in at most NumMin of frames"); MAP("OpenGOP", &p->b_open_gop, MAP_NUM, "Open GOP or Closed GOP, 1: Open(default), 0: Closed"); MAP("UseHadamard", &p->enable_hadamard, MAP_NUM, "Hadamard transform (0=not used, 1=used)"); MAP("FME", &p->me_method, MAP_NUM, "Motion Estimation method: 0-Full Search, 1-DIA, 2-HEX, 3-UMH (default), 4-TZ"); MAP("SearchRange", &p->search_range, MAP_NUM, "Max search range"); MAP("NumberReferenceFrames", &p->num_max_ref, MAP_NUM, "Number of previous frames used for inter motion search (1-5)"); #if XAVS2_TRACE MAP("TraceFile", &p->psz_trace_file, MAP_STR, "Tracing file path"); #endif MAP("TemporalIdExistFlag", &p->temporal_id_exist_flag, MAP_NUM, 
"temporal ID"); MAP("FFRAMEEnable", &p->enable_f_frame, MAP_NUM, "Use F Frame or not (0: Don't use F frames 1:Use F frames instead of P frames)"); MAP("DHPEnable", &p->enable_dhp, MAP_NUM, "(0: Don't use DHP, 1:Use DHP)"); MAP("MHPSKIPEnable", &p->enable_mhp_skip, MAP_NUM, "(0: Don't use MH_PSKIP, 1:Use MH_PSKIP)"); MAP("WSMEnable", &p->enable_wsm, MAP_NUM, "(0: Don't use WSM, 1:Use WSM)"); MAP("NumberBFrames", &p->num_bframes, MAP_NUM, "Number of B frames inserted between I/P/F frames (0=not used)"); MAP("Inter2PU" , &p->inter_2pu, MAP_NUM, "inter partition mode 2NxN or Nx2N or AMP"); MAP("InterAMP", &p->enable_amp, MAP_NUM, "inter partition mode AMP"); MAP("IntraInInter", &p->enable_intra, MAP_NUM, "intra partition in inter frame"); MAP("RdoLevel", &p->i_rd_level, MAP_NUM, "RD-optimized mode decision (0:off, 1: only for best partition mode of one CU, 2: only for best 2 partition modes; 3: All partition modes)"); MAP("LoopFilterDisable", &p->loop_filter_disable, MAP_NUM, "Disable loop filter in picture header (0=Filter, 1=No Filter)"); MAP("LoopFilterParameter", &p->loop_filter_parameter_flag, MAP_NUM, "Send loop filter parameter (0= No parameter, 1= Send Parameter)"); MAP("LoopFilterAlphaOffset", &p->alpha_c_offset, MAP_NUM, "Aplha offset in loop filter"); MAP("LoopFilterBetaOffset", &p->beta_offset, MAP_NUM, "Beta offset in loop filter"); MAP("SAOEnable", &p->enable_sao, MAP_NUM, "Enable SAO or not (1: on, 0: off)"); MAP("ALFEnable", &p->enable_alf, MAP_NUM, "Enable ALF or not (1: on, 0: off)"); MAP("ALFLowLatencyEncodingEnable", &p->alf_LowLatencyEncoding, MAP_NUM, "Enable Low Latency ALF (1=Low Latency mode, 0=High Efficiency mode)"); MAP("CrossSliceLoopFilter", &p->b_cross_slice_loop_filter, MAP_NUM, "Enable Cross Slice Boundary Filter (0=Disable, 1=Enable)"); /* */ // MAP("InterlaceCodingOption", &p->InterlaceCodingOption, MAP_NUM); // MAP("RepeatFirstField", &p->repeat_first_field, MAP_NUM); // MAP("TopFieldFirst", &p->top_field_first, MAP_NUM); // MAP("OutputMergedPicture", &p->output_merged_picture, MAP_NUM); // MAP("Progressive_sequence", &p->progressive_sequence, MAP_NUM); // MAP("Progressive_frame", &p->progressive_frame, MAP_NUM); /* extension configuration */ // MAP("TDMode", &p->TD_mode, MAP_NUM); // MAP("ViewPackingMode", &p->view_packing_mode, MAP_NUM); // MAP("ViewReverse", &p->view_reverse, MAP_NUM); MAP("WQEnable", &p->enable_wquant, MAP_NUM, "Weighted quantization"); #if XAVS2_TRACE && ENABLE_WQUANT MAP("SeqWQM", &p->SeqWQM, MAP_NUM); MAP("SeqWQFile", &p->psz_seq_wq_file, MAP_STR); MAP("PicWQEnable", &p->PicWQEnable, MAP_NUM); MAP("WQParam", &p->WQParam, MAP_NUM); MAP("WQModel", &p->WQModel, MAP_NUM); MAP("WeightParamDetailed", &p->WeightParamDetailed, MAP_STR); MAP("WeightParamUnDetailed", &p->WeightParamUnDetailed, MAP_STR); MAP("ChromaDeltaQPDisable", &p->chroma_quant_param_disable, MAP_NUM); MAP("ChromaDeltaU", &p->chroma_quant_param_delta_u, MAP_NUM); MAP("ChromaDeltaV", &p->chroma_quant_param_delta_v, MAP_NUM); MAP("PicWQDataIndex", &p->PicWQDataIndex, MAP_NUM); MAP("PicWQFile", &p->psz_pic_wq_file, MAP_STR); #endif MAP("RdoqLevel", &p->i_rdoq_level, MAP_NUM, "Rdoq Level (0: off, 1: cu level, only for best partition mode, 2: all mode)"); MAP("LambdaFactor", &p->lambda_factor_rdoq, MAP_NUM, "default: 75, Rdoq Lambda factor"); MAP("LambdaFactorP", &p->lambda_factor_rdoq_p, MAP_NUM, "default: 120, Rdoq Lambda factor P/F frame"); MAP("LambdaFactorB", &p->lambda_factor_rdoq_b, MAP_NUM, "default: 100, Rdoq Lambda factor B frame"); MAP("PMVREnable", &p->enable_pmvr, 
MAP_NUM, "PMVR"); MAP("NSQT", &p->enable_nsqt, MAP_NUM, "NSQT"); MAP("SDIP", &p->enable_sdip, MAP_NUM, "SDIP"); MAP("SECTEnable", &p->enable_secT, MAP_NUM, "Secondary Transform"); MAP("TDRDOEnable", &p->enable_tdrdo, MAP_NUM, "TDRDO, only for LDP configuration (without B frames)"); MAP("RefineQP", &p->enable_refine_qp, MAP_NUM, "Refined QP, only for RA configuration (with B frames)"); MAP("RateControl", &p->i_rc_method, MAP_NUM, "0: CQP, 1: CBR (frame level), 2: CBR (SCU level), 3: VBR"); MAP("TargetBitRate", &p->i_target_bitrate, MAP_NUM, "target bitrate, in bps"); MAP("QP", &p->i_initial_qp, MAP_NUM, "initial qp for first frame (8bit: 0~63; 10bit: 0~79)"); MAP("InitialQP", &p->i_initial_qp, MAP_NUM, " - Same as `QP`"); MAP("QPIFrame", &p->i_initial_qp, MAP_NUM, " - Same as `QP`"); MAP("MinQP", &p->i_min_qp, MAP_NUM, "min qp (8bit: 0~63; 10bit: 0~79)"); MAP("MaxQP", &p->i_max_qp, MAP_NUM, "max qp (8bit: 0~63; 10bit: 0~79)"); MAP("GopSize", &p->i_gop_size, MAP_NUM, "sub GOP size (negative numbers indicating an employ of default settings, which will invliadate the following settings.)"); MAP("PresetLevel", &p->preset_level, MAP_NUM, "preset level for tradeoff between speed and performance, ordered from fastest to slowest (0, ..., 9), default: 5"); MAP("Preset", &p->preset_level, MAP_NUM, " - Same as `PresetLevel`"); MAP("SliceNum", &p->slice_num, MAP_NUM, "Number of slices for each frame"); MAP("NumParallelGop", &p->num_parallel_gop, MAP_NUM, "number of parallel GOPs (0,1: no GOP parallelization)"); MAP("ThreadFrames", &p->i_frame_threads, MAP_NUM, "number of parallel threads for frames ( 0: auto )"); MAP("ThreadRows", &p->i_lcurow_threads, MAP_NUM, "number of parallel threads for rows ( 0: auto )"); MAP("EnableAecThread", &p->enable_aec_thread, MAP_NUM, "Enable AEC thread or not (default: enabled)"); MAP("LogLevel", &p->i_log_level, MAP_NUM, "log level: -1: none, 0: error, 1: warning, 2: info, 3: debug"); MAP("Log", &p->i_log_level, MAP_NUM, " - Same as `LogLevel`"); MAP("EnablePSNR", &p->enable_psnr, MAP_NUM, "Enable PSNR or not (default: Enable)"); MAP("EnableSSIM", &p->enable_ssim, MAP_NUM, "Enable SSIM or not (default: Enable)"); /* end mapping */ MAP("", NULL, MAP_END, "") } /** * =========================================================================== * function defines * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : allocate memory buffer, and read contents from file. * Params : cfg_file - name of the file to be read. * Return : file content buffer, or NULL on error. * Remarks : the content buffer size is two times of file size so more * : additive config can be appended, and the file size must be * : less than 2M bytes. 
* --------------------------------------------------------------------------- */ static char *GetConfigFileContent(char *file_content, int file_buf_size, const char *cfg_file) { FILE *f_cfg; int file_len; /* open file */ if ((f_cfg = fopen(cfg_file, "rb")) == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Cannot open configuration file %s.\n", cfg_file); return NULL; } /* get the file size */ if (fseek(f_cfg, 0, SEEK_END) != 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Cannot fseek in configuration file %s.\n", cfg_file); fclose(f_cfg); return NULL; } file_len = (int)ftell(f_cfg); if (file_len < 0 || file_len > 2 * 1024 * 1024) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Unreasonable file size (%d) reported by ftell for configuration file %s.\n", file_len, cfg_file); fclose(f_cfg); return NULL; } if (fseek(f_cfg, 0, SEEK_SET) != 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Cannot fseek in configuration file %s.\n", cfg_file); fclose(f_cfg); return NULL; } if (file_len + 16 >= file_buf_size) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Too large configuration file: \"%s\", size %d Bytes\n", cfg_file, file_len); file_len = file_buf_size - 16; } /* read file to buffer * Note that ftell() gives us the file size as the file system sees it. * The actual file size, as reported by fread() below will be often smaller * due to CR/LF to CR conversion and/or control characters after the dos * EOF marker in the file. */ file_len = (int)fread(file_content, 1, file_len, f_cfg); file_content[file_len++] = '\n'; file_content[file_len++] = '\0'; /* close file */ fclose(f_cfg); return file_content; } /* --------------------------------------------------------------------------- */ static intptr_t ParseRefContent(xavs2_param_t *param, char **buf) { char header[10] = { 'F', 'r', 'a', 'm', 'e', '\0', '\0', '\0', '\0', '\0' }; char str[4]; char *token; const char *colon = ":"; char **p = buf; xavs2_rps_t *tmp; int i = 1; int j; int i_gop_size = 0; int predict; sprintf(str, "%d", i); strcat(header, str); strcat(header, colon); memset(param->cfg_ref_all, -1, XAVS2_MAX_GOPS * sizeof(xavs2_rps_t)); while (0 == strcmp(header, *p++)) { i_gop_size++; tmp = param->cfg_ref_all + i - 1; token = *p++; tmp->poc = atoi(token); token = *p++; tmp->qp_offset = atoi(token); token = *p++; tmp->num_of_ref = atoi(token); token = *p++; tmp->referd_by_others = atoi(token); for (j = 0; j < tmp->num_of_ref; j++) { token = *p++; tmp->ref_pic[j] = atoi(token); } token = *p++; predict = atoi(token); if (predict != 0) { token = *p++; j /* delta_rps */ = atoi(token); /* delta_rps, not used */ } token = *p++; tmp->num_to_rm = atoi(token); for (j = 0; j < tmp->num_to_rm; j++) { token = *p++; tmp->rm_pic[j] = atoi(token); } if (param->temporal_id_exist_flag == 1) { token = *p++; tmp->temporal_id = atoi(token); } header[5] = header[6] = header[7] = header[8] = header[9] = '\0'; sprintf(str, "%d", ++i); strcat(header, str); strcat(header, colon); } if (param->i_gop_size > 0 && param->i_gop_size != i_gop_size) { xavs2_log(NULL, XAVS2_LOG_ERROR, "gop_size set error.\n"); } if (i_gop_size > XAVS2_MAX_GOP_SIZE) { xavs2_log(NULL, XAVS2_LOG_ERROR, "gop_size set error (must <= %d).\n", XAVS2_MAX_GOP_SIZE); } return (p - buf - 1); } /* --------------------------------------------------------------------------- */ static INLINE int ParameterNameToMapIndex(xavs2_param_map_t *p_map_tab, const char *param_name) { mapping_t *map_tab = p_map_tab->map_tab; int i = 0; while (map_tab[i].name[0] != '\0') { // ֹλǿַ if (xavs2_param_match(map_tab[i].name, param_name)) { return i; } else { i++; } } 
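/* no entry matched: the mapping table is terminated by an item whose name is
 * an empty string (type MAP_END), so leaving the loop means the parameter
 * name is unknown */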
return -1; } /* --------------------------------------------------------------------------- */ static INLINE void get_param_name(char *name, const char *param_item) { char *str; name[0] = '\0'; str = strtok(param_item, "_"); while (str) { strcat(name, (const char *)str); str = strtok(NULL, "_"); } } /* --------------------------------------------------------------------------- */ static INLINE int xavs2e_atoi(const char *str, int *b_error) { char *end; int v = strtol(str, &end, 0); if (end == str || *end != '\0') { *b_error = 1; } return v; } /* --------------------------------------------------------------------------- */ static INLINE float xavs2e_atof(const char* str, int *b_error) { char *end; float v = strtof(str, &end); if (end == str || *end != '\0') { *b_error = 1; } return v; } /*--------------------------------------------------------------------------- */ static char *copy_parameter(char *dst, const char *src) { while (*src != '\0') { if (*src == '=') { /* the parser expects whitespace before & after '=' */ *dst++ = ' '; *dst++ = '='; *dst++ = ' '; } else { *dst++ = *src; } src++; } *dst++ = ' '; // add a space to support multiple config items return dst; } /** * --------------------------------------------------------------------------- * Function : get contents from config file and command line * Params : argc - argument counter * : argv - argument viscera, an array of null-terminated strings * Return : file content buffer, or NULL on error * Remarks : the content buffer size is two times of file size so more * : additive config can be appended, and the file size must be * : less than 2M bytes * --------------------------------------------------------------------------- */ static char *xavs2_get_configs(int argc, const char * const *argv) { const int size_file_max = 1 << 20; // 1MB char item[4096]; char *dst; char *cfg_content = (char *)xavs2_malloc(2 * size_file_max); char *file_content = (char *)xavs2_malloc(size_file_max); int item_len; int num; /* number of parameters */ int i; /* config file is the first parameter */ if (cfg_content == NULL || file_content == NULL) { return NULL; } cfg_content[0] = '\0'; /* parse the rest command line */ for (i = 1; i < argc;) { if (0 == strncmp(argv[i], "-f", 2)) { /* a new configuration file */ GetConfigFileContent(file_content, size_file_max, argv[i + 1]); strcat(cfg_content, file_content); i += 2; } else if (argv[i][0] == '-' && argv[i][1] == '-') { // "--Parameter=XXX" style dst = copy_parameter(item, argv[i] + 2); /* add \n for each item */ *dst++ = '\n'; *dst = '\0'; xavs2_log(NULL, XAVS2_LOG_DEBUG, "Adding cmd-line string 1: %s", item); /* append this item to the cfg_content */ strcat(cfg_content, item); i++; } else if (0 == strncmp(argv[i], "-p", 2)) { /* a config change? 
*/ /* collect all data until next parameter (starting with - * (x is any character)), and append it to cfg_content */ i++; item_len = 0; /* determine the necessary size for current item */ for (num = i; num < argc && argv[num][0] != '-'; num++) { /* calculate the length for all the strings of current item */ item_len += (int)(strlen(argv[num])); } /* additional bytes for spaces and \0s */ item_len = ((item_len + 128) >> 4) << 4; item[0] = '\0'; dst = item; /* concatenate all parameters identified before */ while (i < num) { dst = copy_parameter(dst, argv[i]); i++; } /* add \n for each item */ *dst++ = '\n'; *dst = '\0'; xavs2_log(NULL, XAVS2_LOG_DEBUG, "Adding cmd-line string 0: %s", item); /* append this item to the cfg_content */ strcat(cfg_content, item); } else { xavs2_log(NULL, XAVS2_LOG_WARNING, "Invalid parameter style, argc %d, around string '%s'\n", i, argv[i]); xavs2_free(cfg_content); cfg_content = NULL; break; } } xavs2_free(file_content); return cfg_content; } /** * --------------------------------------------------------------------------- * Function : Parsing encoding parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : argc - number of command line parameters * [in ] : argv - pointer to parameter strings * [out] : int - zero for success, otherwise failed * Return : none * --------------------------------------------------------------------------- */ int xavs2_encoder_opt_set(xavs2_param_t *param, int argc, char *argv[]) { char *items[MAX_ITEMS]; char *contents; char *p; char *bufend; char name[64]; int map_index; int item = 0; int in_string = 0; int in_item = 0; int i; if ((contents = xavs2_get_configs(argc, argv)) == NULL) { fprintf(stderr, "get contents from configure file error."); return -1; } p = contents; bufend = &contents[strlen(contents)]; /* alloc memory for mapping table and initialize the table */ memset(&g_param_map, 0, sizeof(g_param_map)); mapping_default(&g_param_map, param); /* generate an argc/argv-type list in items[], without comments and whitespace. * this is context insensitive and could be done most easily with lex(1). 
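 * (illustrative example: the input text  QP = 32  # initial qp  is split into
 *  the three tokens "QP", "=" and "32"; everything from '#' to the end of the
 *  line is dropped, and a quoted string is kept together as a single token.)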
*/ while (p < bufend) { switch (*p) { case '#': // found comment *p = '\0'; // replace '#' with '\0' in case of comment immediately following integer or string while (*p != '\n' && p < bufend) { p++; // skip till EOL or EOF, whichever comes first } in_string = 0; in_item = 0; break; case '\r': // case 13 case '\n': in_item = 0; in_string = 0; *p++ = '\0'; break; case ' ': case '\t': // skip whitespace, leave state unchanged if (in_string) { p++; } else { *p++ = '\0';// terminate non-strings once whitespace is found in_item = 0; } break; case '\'': case '\"': // begin/end of string *p++ = '\0'; if (!in_string) { items[item++] = p; in_item = ~in_item; } else { in_item = 0; } in_string = ~in_string; // toggle break; default: if (!in_item) { items[item++] = p; in_item = ~in_item; } p++; } } for (i = 0; i < item; i += 3) { get_param_name(name, items[i]); if (0 == strcmp(name, "Frame1:")) { i += (int)ParseRefContent(param, &items[i]); get_param_name(name, items[i]); } if ((map_index = ParameterNameToMapIndex(&g_param_map, name)) < 0) { xavs2_log(NULL, XAVS2_LOG_WARNING, "Parameter Name not recognized: '%s'.\n", items[i]); continue; // do not exit, continue to parse } if (i + 2 >= item) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Parsing error in the last parameter: %s.\n", items[i]); break; } if (strcmp("=", items[i + 1])) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Parsing error in config file: '=' expected as the second token in each line.\n"); return -1; } if (xavs2_encoder_opt_set2(param, name, items[i + 2]) < 0) { return -1; } } fflush(stdout); fflush(stderr); xavs2_free(contents); return 0; } /** * --------------------------------------------------------------------------- * Function : Parsing encoding parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of parameter * [in ] : value_string - parameter value * [in ] : value_i - when value_string is null, use this value * Return : int - zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_opt_set2(xavs2_param_t *param, const char *name, const char *value_string) { int map_index; int b_error = 0; if (g_param_map.param != param) { /* alloc memory for mapping table and initialize the table */ memset(&g_param_map, 0, sizeof(g_param_map)); mapping_default(&g_param_map, param); } if ((map_index = ParameterNameToMapIndex(&g_param_map, name)) >= 0) { int item_value; switch (g_param_map.map_tab[map_index].type) { case MAP_NUM: // numerical item_value = xavs2e_atoi(value_string, &b_error); if (b_error) { xavs2_log(NULL, XAVS2_LOG_ERROR, " Parsing error: Expected numerical value for Parameter of %s, found '%s'.\n", name, value_string); return -1; } *(int *)(g_param_map.map_tab[map_index].addr) = item_value; if (xavs2_param_match(name, "preset_level") || xavs2_param_match(name, "presetlevel") || xavs2_param_match(name, "preset")) { parse_preset_level(param, param->preset_level); } // fprintf(stdout, "."); break; case MAP_FLAG: item_value = xavs2e_atoi(value_string, &b_error); if (b_error) { xavs2_log(NULL, XAVS2_LOG_ERROR, " Parsing error: Expected numerical value for Parameter of %s, found '%s'.\n", name, value_string); return -1; } *(bool_t *)(g_param_map.map_tab[map_index].addr) = (bool_t)(!!item_value); // fprintf(stdout, "."); break; case MAP_STR: // string strcpy((char *)g_param_map.map_tab[map_index].addr, value_string); // fprintf(stdout, "."); break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "Unknown value type in the map definition of 
config file.\n"); return -1; break; } } else if (xavs2_param_match(name, "threads")) { param->i_lcurow_threads = xavs2e_atoi(value_string, &b_error); param->i_frame_threads = 0; } else if (xavs2_param_match(name, "bframes")) { int value_i = xavs2e_atoi(value_string, &b_error); if (value_i > 0) { param->i_gop_size = value_i < 4 ? -4 : -8; param->num_bframes = XAVS2_ABS(param->i_gop_size) - 1; param->b_open_gop = 0; } else { param->num_bframes = 0; param->i_gop_size = -4; param->b_open_gop = 0; } } else if (xavs2_param_match(name, "fps")) { float fps = xavs2e_atof(value_string, &b_error); float min_error = 1000; int min_idx = 0; int i; for (i = 0; i < 8; i++) { float f_err = (float)fabs(FRAME_RATE[i] - fps); if (f_err < min_error) { min_error = f_err; min_idx = i; } } param->frame_rate_code = min_idx; param->frame_rate = fps; } else if (xavs2_param_match(name, "bitdepth")) { int value_i = xavs2e_atoi(value_string, &b_error); param->input_sample_bit_depth = value_i; param->sample_bit_depth = value_i; } return 0; } /** * --------------------------------------------------------------------------- * Function : get value of a specific parameter * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of a parameter * Return : const char *: value string * --------------------------------------------------------------------------- */ const char * xavs2_encoder_opt_get(xavs2_param_t *param, const char *name) { static char buf[64]; if (xavs2_param_match(name, "input")) { return param->psz_in_file; } else if (xavs2_param_match(name, "output")) { return param->psz_bs_file; } else if (xavs2_param_match(name, "width")) { sprintf(buf, "%d", param->org_width); return buf; } else if (xavs2_param_match(name, "height")) { sprintf(buf, "%d", param->org_height); return buf; } else if (xavs2_param_match(name, "frames")) { sprintf(buf, "%d", param->num_frames); return buf; } else if (xavs2_param_match(name, "BitDepth")) { sprintf(buf, "%d", param->sample_bit_depth); return buf; } else if (xavs2_param_match(name, "SampleShift")) { sprintf(buf, "%d", param->sample_bit_depth - param->input_sample_bit_depth); return buf; } return NULL; } /** * --------------------------------------------------------------------------- * Function : print the usage and the list of supported parameters * Parameters : * [in ] : none * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xavs2_encoder_opt_help(void) { mapping_t *p_map = NULL; xavs2_param_t param; xavs2_log(NULL, XAVS2_LOG_INFO, "Usage:\n\t [-f EncoderFile.cfg] [-p ParameterName=Value] [--ParameterName=value]\n"); xavs2_log(NULL, XAVS2_LOG_INFO, "Supported parameters:\n"); memset(&g_param_map, 0, sizeof(g_param_map)); mapping_default(&g_param_map, &param); p_map = g_param_map.map_tab; while (p_map != NULL) { if (p_map->addr == NULL) { break; } xavs2_log(NULL, XAVS2_LOG_INFO, " %-20s : %s\n", p_map->name, p_map->s_instruction); p_map++; } } xavs2-1.3/source/encoder/pre_encode.c000066400000000000000000000310651340660520300176120ustar00rootroot00000000000000/* * pre_encode.c * * Description of this file: * Pre-Encode functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc.
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "cudata.h" #include "wrapper.h" #include "frame.h" #include "encoder.h" #include "cpu.h" #include "ratecontrol.h" #include "tdrdo.h" #include "presets.h" #include "rps.h" /* --------------------------------------------------------------------------- */ static int slice_type_analyse(xavs2_handler_t *h_mgr, xavs2_frame_t *frm) { /* GOP structures * openGOP: I B......B F (...) B......B F B......B I B......B F (...) * |<-subGOP->| .. |<-subGOP->||<-subGOP->||<-subGOP->| * |<---------- GOP0 ---------->||<---------- GOP1 ---------->| * * closeGOP: I B......B F (...) B......B F I B......B F (...) B......B F * |<-subGOP->| .. |<-subGOP->| |<-subGOP->| |<-subGOP->| * |<---------- GOP0 ---------->||<----------- GOP1 ---------->| * */ lookahead_t *lookahead = &h_mgr->lookahead; const xavs2_param_t *param = h_mgr->p_coder->param; int b_delayed = 0; // the frame is normal to be encoded default /* slice type decision */ if (lookahead->start) { int p_frm_type = param->enable_f_frame ? 
XAVS2_TYPE_F : XAVS2_TYPE_P; if (param->intra_period_max == 1) { // for AI (All Intra) frm->i_frm_type = XAVS2_TYPE_I; frm->b_keyframe = 1; } else if (param->intra_period_max == 0 || param->num_bframes == 0) { // for LDP (with no intra period) frm->i_frm_type = p_frm_type; frm->b_keyframe = 0; lookahead->gopframes++; // when intra period is non-zero, set key frames if (lookahead->gopframes - 1 == param->intra_period_max) { frm->i_frm_type = XAVS2_TYPE_I; frm->b_keyframe = 1; lookahead->gopframes = 1; } } else { // for RA (with any intra period) or LDP (with an intra period > 1), // buffer all these frames lookahead->gopframes++; b_delayed = 1; // the frame is delayed to be encoded frm->b_keyframe = 0; --lookahead->bpframes; if (param->b_open_gop && lookahead->gopframes - 1 == param->intra_period_max) { // new sequence start // note: this i-frame's POI does NOT equal to its COI frm->i_frm_type = XAVS2_TYPE_I; frm->b_keyframe = 1; lookahead->gopframes = 1; lookahead->bpframes = param->i_gop_size; } else if (!param->b_open_gop && lookahead->gopframes == param->intra_period_max) { frm->i_frm_type = p_frm_type; lookahead->start = 0; lookahead->bpframes = param->i_gop_size; } else if (lookahead->bpframes > 0) { // the first 'bpframes - 1' frames is of type B frm->i_frm_type = XAVS2_TYPE_B; } else { frm->i_frm_type = p_frm_type; lookahead->bpframes = param->i_gop_size; } } } else { // the very first frame of an open GOP stream or the first frame (IDR) of a close GOP stream frm->i_frm_type = XAVS2_TYPE_I; frm->b_keyframe = 1; lookahead->start = 1; // set flag lookahead->bpframes = param->i_gop_size; lookahead->gopframes= 1; } return b_delayed; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void decide_frame_dts(xavs2_handler_t *h_mgr, xavs2_frame_t *frm) { int num_bframes_delay = h_mgr->p_coder->picture_reorder_delay; int num_encoded_frame = h_mgr->num_encoded_frames_for_dts; if (num_bframes_delay) { if (num_encoded_frame > num_bframes_delay) { frm->i_dts = h_mgr->prev_reordered_pts_set[num_encoded_frame % num_bframes_delay]; } else { frm->i_dts = frm->i_reordered_pts - num_bframes_delay; } h_mgr->prev_reordered_pts_set[num_encoded_frame % num_bframes_delay] = frm->i_reordered_pts; } else { frm->i_dts = frm->i_reordered_pts; } h_mgr->num_encoded_frames_for_dts++; /* Խ */ if (h_mgr->num_encoded_frames_for_dts > 32768) { h_mgr->num_encoded_frames_for_dts -= 16384; } } /* --------------------------------------------------------------------------- */ static INLINE void lookahead_append_frame(xavs2_handler_t *h_mgr, xlist_t *list_out, xavs2_frame_t *fenc, int num_bframes, int idx_in_gop) { if (fenc->i_state != XAVS2_EXIT_THREAD && fenc->i_state != XAVS2_FLUSH) { fenc->i_frm_coi = h_mgr->ipb.COI; h_mgr->ipb.COI++; frame_buffer_update(h_mgr->p_coder, &h_mgr->ipb, fenc); fenc->i_gop_idr_coi = h_mgr->ipb.COI_IDR; decide_frame_dts(h_mgr, fenc); UNUSED_PARAMETER(num_bframes); UNUSED_PARAMETER(idx_in_gop); } if (fenc != NULL) { xl_append(list_out, fenc); } } /* append a group of frames to the output list */ static INLINE void lookahead_append_subgop_frames(xavs2_handler_t *h_mgr, xlist_t *list_out, xavs2_frame_t **blocked_frm_set, int64_t *blocked_pts_set, int num_frames) { xavs2_t *h = h_mgr->p_coder; const xavs2_param_t *param = h->param; int i; /* append all frames one by one to output list */ if (param->i_gop_size == num_frames) { for (i = 0; i < num_frames; i++) { int k = param->cfg_ref_all[i].poc; if (k > 0) { /* get a frame to encode */ 
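/* cfg_ref_all[] lists the sub-GOP frames in coding order, and its poc field is the 1-based display-order index inside the sub-GOP -- the same slot in which the frame was parked in blocked_frm_set[] by send_frame_to_enc_queue() */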
xavs2_frame_t *frm = blocked_frm_set[k]; if (frm == NULL) { break; } /* clear */ blocked_frm_set[k] = NULL; /* set DTS */ frm->i_reordered_pts = blocked_pts_set[i + 1]; /* append to output list to be encoded */ lookahead_append_frame(h_mgr, list_out, frm, param->num_bframes, i + 1); h_mgr->num_encode++; } else { break; } } #if !RELEASE_BUILD /* check the buffer */ for (i = 1; i <= num_frames; i++) { assert(blocked_frm_set[i] == NULL); } #endif } else if (num_frames > 0) { static const int tab_poc_order[][8] = { { 1, 0, 0, 0, 0, 0, 0, 0 }, // 1 { 2, 1, 0, 0, 0, 0, 0, 0 }, // 2: 1 B frame { 3, 1, 2, 0, 0, 0, 0, 0 }, // 3: 2 B frames { 4, 2, 1, 3, 0, 0, 0, 0 }, // 4: 3 B frames { 5, 2, 1, 3, 4, 0, 0, 0 }, // 5: 4 B frames { 6, 3, 1, 2, 4, 5, 0, 0 }, // 6: 5 B frames { 7, 3, 1, 2, 5, 4, 6, 0 }, // 7: 6 B frames { 8, 4, 2, 1, 3, 6, 5, 7 }, // 8: 7 B frames }; const int *p_tab_poc = tab_poc_order[num_frames - 1]; for (i = 0; i < num_frames; i++) { int k = p_tab_poc[i]; if (k > 0) { /* get a frame to encode */ xavs2_frame_t *frm = blocked_frm_set[k]; if (frm == NULL) { break; } /* clear */ blocked_frm_set[k] = NULL; /* set frame type */ if (i == 0) { frm->i_frm_type = h_mgr->p_coder->param->enable_f_frame ? XAVS2_TYPE_F : XAVS2_TYPE_P; } /* set DTS */ frm->i_reordered_pts = blocked_pts_set[i + 1]; /* append to output list to be encoded */ lookahead_append_frame(h_mgr, list_out, frm, param->num_bframes, i + 1); h_mgr->num_encode++; } else { break; } } } /* reset the index */ h_mgr->num_blocked_frames = 0; /* the buffer is empty now */ } /** * =========================================================================== * interface function defines (xavs2 encoder library APIs for AVS2 video encoder) * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : complexity analysis and slice type decision of one frame, * then send the frame into encoding queue * Parameters : * [in ] : h_mgr - pointer to xavs2_handler_t * [out] : end of encoding * Return : none * --------------------------------------------------------------------------- */ int send_frame_to_enc_queue(xavs2_handler_t *h_mgr, xavs2_frame_t *frm) { xavs2_t *h = h_mgr->p_coder; const xavs2_param_t *param = h->param; xavs2_frame_t **blocked_frm_set = h_mgr->blocked_frm_set; int64_t *blocked_pts_set = h_mgr->blocked_pts_set; xlist_t *list_out = &h_mgr->list_frames_ready; /* check state */ if (frm->i_state == XAVS2_EXIT_THREAD) { /* 1, estimate frame complexity and append rest frames */ lookahead_append_subgop_frames(h_mgr, list_out, blocked_frm_set, blocked_pts_set, h_mgr->num_blocked_frames); /* 2, append current frame */ lookahead_append_frame(h_mgr, list_out, frm, 0, 0); /* 3, exit this thread */ return -1; } /* process... */ if (frm->i_state != XAVS2_FLUSH) { /* decide the slice type of current frame */ int b_delayed = slice_type_analyse(h_mgr, frm); // is frame delayed to be encoded (B frame) ? if (b_delayed) { /* block a whole GOP until the last frame(I/P/F) of current GOP * a GOP should look somewhat like(POC order): B...BP */ h_mgr->num_blocked_frames++; assert(h_mgr->num_blocked_frames <= param->i_gop_size); /* store the frame in blocked buffers */ blocked_frm_set[h_mgr->num_blocked_frames] = frm; blocked_pts_set[h_mgr->num_blocked_frames] = frm->i_pts; /* is the last frame(I/P/F) of current GOP? 
*/ if (frm->i_frm_type != XAVS2_TYPE_B) { lookahead_append_subgop_frames(h_mgr, list_out, blocked_frm_set, blocked_pts_set, h_mgr->num_blocked_frames); } } else { assert(h_mgr->num_blocked_frames == 0); frm->i_reordered_pts = frm->i_pts; /* DTS is same as PTS */ lookahead_append_frame(h_mgr, list_out, frm, param->num_bframes, h_mgr->num_blocked_frames); h_mgr->num_encode++; } } else { /* flushing... */ lookahead_append_subgop_frames(h_mgr, list_out, blocked_frm_set, blocked_pts_set, h_mgr->num_blocked_frames); h_mgr->num_blocked_frames = 0; /* append current frame to label flushing */ lookahead_append_frame(h_mgr, list_out, frm, 0, 0); } return 0; } xavs2-1.3/source/encoder/presets.c000066400000000000000000000361711340660520300171770ustar00rootroot00000000000000/* * presets.c * * Description of this file: * parse preset level functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common/common.h" #include "encoder/aec.h" #include "presets.h" /** * =========================================================================== * macros * =========================================================================== */ /* macros for enable/disable algorithms */ #define SWITCH_OFF(m) enable_algs &= (~(1LL << (m))) #define SWITCH_ON(m) enable_algs |= (1LL << (m)) /** * =========================================================================== * local tables * =========================================================================== */ /* --------------------------------------------------------------------------- * ֡ȿRDOģʽӦͬpreset */ static const uint8_t INTRA_FULL_RDO_NUM[][MAX_CU_SIZE_IN_BIT + 1] = { { 0, 0, 1, 1, 1, 1, 1 }, /* 0: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 2, 2, 2, 2, 1 }, /* 1: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 3, 3, 3, 3, 2 }, /* 2: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 4, 4, 3, 3, 2 }, /* 3: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 5, 5, 5, 4, 3 }, /* 4: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 6, 6, 6, 4, 3 }, /* 5: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 7, 7, 7, 6, 5 }, /* 6: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 8, 8, 8, 6, 5 }, /* 7: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 9, 9, 9, 9, 9 }, /* 8: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ { 0, 0, 9, 9, 9, 9, 9 }, /* 9: 1x1, 2x2, 4x4, 8x8, 16x16, 32x32, 64x64 */ }; /* --------------------------------------------------------------------------- * ֡ɫȿ RDO ģʽ (ͬpreset) */ static const int8_t tab_num_rdo_chroma_intra_mode[] = { 1, 2, 2, 2, 3, 3, 4, 4, 5, 5 }; /* ֡RMDֵΪ21ĽǶ */ static const int8_t tab_num_angle_dist2[] = { 0, 0, 4, 4, 4, 4, 5, 5, 6, 6 }; static const int8_t tab_num_angle_dist1[] = { 0, 0, 0, 0, 2, 2, 3, 3, 4, 4 }; /* --------------------------------------------------------------------------- * ȫʱжֵ */ static const float tab_th_zero_block_factor[] = { 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 }; /* --------------------------------------------------------------------------- * QSFD㷨ֵϵͬpreset */ const static double tab_qsfd_s_presets[][10] = { /* preset_level: * 0 1 2 3 4 5 6 7 8 9 */ { 2.3, 1.8, 1.6, 1.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, /* inter */ { 0.9, 0.7, 0.7, 0.6, 0.5, 0.4, 0.3, 0.3, 0.2, 0.2}, /* intra */ }; const static double tab_qsfd_cu_size_weight[4] = { 0.25, 1.0, 3.0, 7.5 /* 8x8, 16x16, 32x32, 64x64 */ }; double tab_qsfd_thres[MAX_QP][2][CTU_DEPTH]; /*-------------------------------------------------------------------------- */ static INLINE void algorithm_init_thresholds(xavs2_param_t *p_param) { int i_preset_level = p_param->preset_level; //trade-off encoding time and performance const double s_inter = tab_qsfd_s_presets[0][i_preset_level]; const double s_intra = tab_qsfd_s_presets[1][i_preset_level]; int i; /* QSFD threasholds */ for (i = 0; i < MAX_QP; i++) { double qstep = 32768.0 / tab_Q_TAB[i]; double th_base = 350 * pow(qstep, 0.9); double th__8 = th_base * tab_qsfd_cu_size_weight[0]; double th_16 = th_base * tab_qsfd_cu_size_weight[1]; double th_32 = th_base * tab_qsfd_cu_size_weight[2]; double th_64 = th_base * tab_qsfd_cu_size_weight[3]; /* inter frame */ tab_qsfd_thres[i][0][0] = th__8 * s_inter; tab_qsfd_thres[i][0][1] = th_16 * s_inter; tab_qsfd_thres[i][0][2] = th_32 * s_inter; tab_qsfd_thres[i][0][3] = th_64 * s_inter; if (i_preset_level < 2) { tab_qsfd_thres[i][0][1] *= 2.0; } /* intra frame */ tab_qsfd_thres[i][1][0] = th__8; tab_qsfd_thres[i][1][1] = th_16 * s_intra * 
1.4; tab_qsfd_thres[i][1][2] = th_32 * s_intra * 1.2; tab_qsfd_thres[i][1][3] = th_64 * s_intra * 1.0; } /* ȫ */ p_param->factor_zero_block = tab_th_zero_block_factor[i_preset_level]; } /* --------------------------------------------------------------------------- * Function : modify configurations according to different preset levels. * Parameters : * [in/out] : p_param - the coding parameter to be set * [in ] : i_preset_level - the preset level * Return : none * --------------------------------------------------------------------------- */ void parse_preset_level(xavs2_param_t *p_param, int i_preset_level) { /* special settings */ if (i_preset_level < 2) { /* only for level: 0,1 */ p_param->search_range = XAVS2_MIN(p_param->search_range, 57); p_param->num_max_ref = 2; } else { /* only for level: 2,3,4,5,6,7,8,9 */ p_param->num_max_ref = XAVS2_MIN(i_preset_level, 4); } /* --------------------------- CUṹ --------------------------- | preset | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | +=================+=====+=====+=====+=====+=====+=====+======+======+======+======+ | ctu | 32 | 32 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | | min-cu-size | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | */ p_param->lcu_bit_level = XAVS2_MIN(p_param->lcu_bit_level, 5 + (i_preset_level > 1)); /* --------------------------- Ԥ --------------------------- */ p_param->inter_2pu = i_preset_level > 1; p_param->enable_intra = i_preset_level > 0; p_param->enable_f_frame = i_preset_level > -1; p_param->enable_mhp_skip = i_preset_level > -1 && p_param->enable_f_frame; p_param->enable_wsm = i_preset_level > 7 && p_param->enable_f_frame; p_param->enable_dhp = i_preset_level > 7 && p_param->enable_f_frame; p_param->enable_dmh = i_preset_level > 4 && p_param->enable_f_frame; p_param->enable_amp = i_preset_level > 4; // NSQT /* --------------------------- 任 --------------------------- */ p_param->enable_sdip = i_preset_level > 4; p_param->enable_nsqt = i_preset_level > 4; p_param->enable_secT = i_preset_level > -1; p_param->b_fast_2lelvel_tu = i_preset_level < 4; /* --------------------------- --------------------------- * Level: All for preset 9, Off for preset 0~2 */ p_param->i_rdoq_level = i_preset_level > 6 ? RDOQ_ALL : i_preset_level > 2 ? 
RDOQ_CU_LEVEL : RDOQ_OFF; /* --------------------------- RDO --------------------------- */ if (i_preset_level < 0) { p_param->i_rd_level = RDO_OFF; } else if (i_preset_level < 1) { p_param->i_rd_level = RDO_CU_LEVEL1; } else if (i_preset_level < 5) { p_param->i_rd_level = RDO_CU_LEVEL2; } else { p_param->i_rd_level = RDO_ALL; } /* --------------------------- ر --------------------------- */ if (i_preset_level <= 1) { p_param->rdo_bit_est_method = 2; } else if (i_preset_level < 4) { p_param->rdo_bit_est_method = 1; } else { p_param->rdo_bit_est_method = 0; } /* --------------------------- ˲ --------------------------- */ p_param->enable_alf = p_param->enable_alf && i_preset_level > 4; p_param->enable_sao = p_param->enable_sao && i_preset_level > 0; p_param->b_fast_sao = i_preset_level < 5; // 4¿SAO /* --------------------------- --------------------------- */ p_param->enable_hadamard = i_preset_level > 0; p_param->enable_tdrdo = i_preset_level > 4 && p_param->enable_tdrdo; /* tell the encoder preset configuration is utilized */ p_param->is_preset_configured = TRUE; } /* --------------------------------------------------------------------------- * reconfigure encoder after one frame has been encoded */ void xavs2_reconfigure_encoder(xavs2_t *h) { UNUSED_PARAMETER(h); } /* --------------------------------------------------------------------------- * fast algorithms for different presets */ static INLINE uint64_t get_fast_algorithms(xavs2_t *h, int i_preset_level) { uint64_t enable_algs = 0; // disable all algorithms UNUSED_PARAMETER(h); switch (i_preset_level) { case 0: // ultra fast SWITCH_ON(OPT_ET_INTRA_DEPTH); SWITCH_ON(OPT_SKIP_DMH_THRES); SWITCH_ON(OPT_EARLY_SKIP); SWITCH_ON(OPT_BYPASS_MODE_FPIC); SWITCH_ON(OPT_BYPASS_SDIP); SWITCH_ON(OPT_BYPASS_INTRA_BPIC); case 1: // super fast SWITCH_ON(OPT_ECU); case 2: // very fast SWITCH_ON(OPT_FAST_ZBLOCK); SWITCH_ON(OPT_FAST_RDO_INTRA_C); case 3: // faster SWITCH_ON(OPT_FAST_CBF_MODE); SWITCH_ON(OPT_ET_RDO_INTRA_L); SWITCH_ON(OPT_BYPASS_INTRA_RDOQ); SWITCH_ON(OPT_RDOQ_AZPC); SWITCH_ON(OPT_PU_RMS); case 4: // fast SWITCH_ON(OPT_CU_DEPTH_CTRL); SWITCH_ON(OPT_SUBCU_SPLIT); SWITCH_ON(OPT_FAST_PU_SEL); SWITCH_ON(OPT_CMS_ETMD); case 5: SWITCH_ON(OPT_ROUGH_SKIP_SEL); SWITCH_ON(OPT_BIT_EST_PSZT); SWITCH_ON(OPT_FAST_ALF); SWITCH_ON(OPT_FAST_SAO); SWITCH_ON(OPT_CBP_DIRECT); SWITCH_ON(OPT_FAST_INTRA_IN_INTER); case 6: // slow SWITCH_ON(OPT_BYPASS_AMP); SWITCH_ON(OPT_CODE_OPTIMZATION); case 7: // slower SWITCH_ON(OPT_CU_QSFD); SWITCH_ON(OPT_TU_LEVEL_DEC); SWITCH_ON(OPT_TR_KEY_FRAME_MD); case 8: // very slow // fast inter SWITCH_ON(OPT_DMH_CANDIDATE); SWITCH_ON(OPT_ADVANCE_CHROMA_AEC); SWITCH_ON(OPT_ROUGH_MODE_SKIP); SWITCH_ON(OPT_PSC_MD); // fast intra SWITCH_ON(OPT_FAST_INTRA_MODE); break; case 9: // placebo enable_algs = 0; /* switch off all fast algorithms */ break; default: assert(0); break; } return enable_algs; } /** * --------------------------------------------------------------------------- * Function : set fast algorithms enabled according to different preset levels * Parameters : * [in ] : h - pointer to struct xavs2_t, the xavs2 encoder * Return : none * --------------------------------------------------------------------------- */ void encoder_set_fast_algorithms(xavs2_t *h) { const int num_algorithm = NUM_FAST_ALGS; int i_preset_level = h->param->preset_level; uint64_t enable_algs = 0; // disable all algorithms if (num_algorithm > 64) { xavs2_log(h, XAVS2_LOG_ERROR, "Algorithms error: too many flags: %d\n", num_algorithm); exit(0); } /* 
------------------------------------------------------------- * 1, switch on some algorithms with little efficiency loss */ /* ǷҪ˶ * ο֡1ʱMVŶMVؾȴﵽ1/4 */ if (i_preset_level < 2) { h->use_fractional_me = 1; } else { h->use_fractional_me = 2; } h->use_fast_sub_me = (i_preset_level < 5); h->UMH_big_hex_level = (i_preset_level < 5) ? 0 : (i_preset_level < 9) ? 1 : 2; h->skip_rough_improved = (i_preset_level > 3); /* ------------------------------------------------------------- * 2, switch off part of fast algorithms according to different preset levels */ enable_algs = get_fast_algorithms(h, i_preset_level); SWITCH_OFF(OPT_ROUGH_PU_SEL); /* apply the settings */ h->i_fast_algs = enable_algs; if (IS_ALG_ENABLE(OPT_ET_RDO_INTRA_L)) { memcpy(h->tab_num_intra_rdo, INTRA_FULL_RDO_NUM[i_preset_level >> 1], sizeof(h->tab_num_intra_rdo)); } else { memcpy(h->tab_num_intra_rdo, INTRA_FULL_RDO_NUM[i_preset_level >> 0], sizeof(h->tab_num_intra_rdo)); } /* RMD㷨Ƕ */ h->num_intra_rmd_dist2 = tab_num_angle_dist2[i_preset_level]; h->num_intra_rmd_dist1 = tab_num_angle_dist1[i_preset_level]; h->num_rdo_intra_chroma = tab_num_rdo_chroma_intra_mode[i_preset_level]; /* ֡Ԥģʽ */ if (IS_ALG_ENABLE(OPT_FAST_INTRA_MODE)) { h->get_intra_candidates_luma = rdo_get_pred_intra_luma_rmd; } else { h->get_intra_candidates_luma = rdo_get_pred_intra_luma; } if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) { h->get_intra_candidates_chroma = rdo_get_pred_intra_chroma_fast; } else { h->get_intra_candidates_chroma = rdo_get_pred_intra_chroma; } /* AEC */ switch (h->param->rdo_bit_est_method) { case 1: case 2: h->size_aec_rdo_copy = sizeof(aec_t) - sizeof(ctx_set_t); h->copy_aec_state_rdo = aec_copy_aec_state_rdo; break; default: h->size_aec_rdo_copy = sizeof(aec_t); h->copy_aec_state_rdo = aec_copy_aec_state; break; } } /** * --------------------------------------------------------------------------- * Function : decide the ultimate parameters used by encoders * Parameters : * [in ] : p_param - the ultimate coding parameter to be set * Return : none * --------------------------------------------------------------------------- */ void decide_ultimate_paramters(xavs2_param_t *p_param) { algorithm_init_thresholds(p_param); if (p_param->preset_level < 4) { p_param->me_method = XAVS2_ME_HEX; } } #undef SWITCH_OFF #undef SWITCH_ON xavs2-1.3/source/encoder/presets.h000066400000000000000000000036141340660520300172000ustar00rootroot00000000000000/* * presets.h * * Description of this file: * parse preset level functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_PRESETS_H #define XAVS2_PRESETS_H #define parse_preset_level FPFX(parse_preset_level) void parse_preset_level(xavs2_param_t *p_param, int i_preset_level); #define encoder_set_fast_algorithms FPFX(encoder_set_fast_algorithms) void encoder_set_fast_algorithms(xavs2_t *h); #define decide_ultimate_paramters FPFX(decide_ultimate_paramters) void decide_ultimate_paramters(xavs2_param_t *p_param); #endif // XAVS2_PRESET_LEVELS_H xavs2-1.3/source/encoder/ratecontrol.c000066400000000000000000000751351340660520300200510ustar00rootroot00000000000000/* * ratecontrol.c * * Description of this file: * Ratecontrol functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "ratecontrol.h" #include "cpu.h" #include "defines.h" /** * =========================================================================== * const defines * =========================================================================== */ static const double PI = (3.14159265358979); static const int RC_MAX_INT = 1024; // max frame number, used to refresh encoder when frame number is not known static const double RC_MAX_DELTA_QP = 3.5; // max delta QP between current key frame and its previous key frame #define RC_LCU_LEVEL 0 // 1 - enable LCU level rate control, 0 - disable #define RC_AUTO_ADJUST 0 // 1 - enable auto adjust the qp // #define RC_MODEL_HISTORY 2 // #define RC_MAX_TEMPORAL_LEVELS 5 /** * =========================================================================== * type defines * =========================================================================== */ #if RC_LCU_LEVEL /* --------------------------------------------------------------------------- */ typedef struct RCLCU { int GlobeLCUNum; int CurBits; int PreBits; double PreBpp; double CurBpp; double LCUTBL; double LCUTBLdelta; double LCUBuffer; double LCUBufferError; double LCUBufferDifError; double LCUPreBufferError; int LCUPreLCUQP; int LCUCurLCUQP; int LCUdeltaQP; double LCUCurLambda; } RCLCU; #endif /* --------------------------------------------------------------------------- * |<--- WIN --->| * . . . . . I B B . . . F B B . . . F B B . . . F B B . . . I B B . . . * |<-- GOP -->|<-- GOP -->|<-- GOP -->|<-- GOP -->| B B . . . */ struct ratectrl_t { //EKIN_MARK /* const */ int i_total_frames; // total frames to be encoded (=0, forever) int i_intra_period; // period of I-frames (=0, only first) int i_frame_size; // frame size in pixel int b_open_gop; // open GOP? 
1: open, 0: close /* qp */ double f_delta_qp; // delta qp int i_last_qp; // qp for the last KEY frame int i_base_qp; // initial and base qp int i_min_qp; /* min QP */ int i_max_qp; /* max QP */ /* count */ int i_coded_frames; // number of encoded frames /* gop/win */ int i_gop_flag; // flag (index of first frame in GOP ) int i_win_size; // size of WIN int i_win_cnt; // count of KEY frames in current WIN (window length = a period of I-frames) int i_win_qp; // sum of KEY frame QP in current WIN double f_win_bpp; // sum of KEY frame BPP in current WIN double f_gop_bpp; // sum of frame BPP in current GOP /* bpp */ double f_target_bpp; // average target BBP (bit per pixel) for each frame double f_intra_bpp; // BPP of intra KEY frame (used only for i_intra_period = 0/1) double f_inter_bpp; // BPP of inter KEY frame (used only for i_intra_period = 0/1) /* buffer */ double f_buf_curr; // current buffer size in BPP (bits per pixel) double f_buf_error; // buffer error double f_buf_error_diff; // different buffer error double f_buf_error_prev; // previous buffer error #if RC_AUTO_ADJUST /* level */ double f_first_buf_level; // first buffer size level double f_target_buf_level; // target buffer level double f_delta_buf_level; // delta value of buffer level #endif #if RC_LCU_LEVEL /* LCU RC */ int RcMBQP; // int SumMBQP; // int NumMB; // int LCUbaseQP; // RCLCU rc_lcu; // #endif xavs2_thread_mutex_t rc_mutex; }; /** * =========================================================================== * local/global variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const double tab_fuzzy_initial[13][13] = { {-4.80, -4.80, -4.80, -4.80, -3.57, -3.57, -3.17, -3.17, -2.00, -2.00, -0.25, -0.25, 0.00}, {-4.80, -4.80, -4.80, -4.80, -3.57, -3.57, -3.17, -3.17, -2.00, -2.00, -0.25, -0.25, 0.00}, {-4.80, -4.80, -3.57, -3.57, -3.57, -3.57, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 0.25}, {-4.80, -4.80, -3.57, -3.57, -3.57, -3.57, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 0.25}, {-3.57, -3.57, -3.57, -3.57, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00}, {-3.57, -3.57, -3.57, -3.57, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00}, {-3.17, -3.17, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00, 2.00, 3.17}, {-3.17, -3.17, -2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00, 2.00, 3.17}, {-2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00, 2.00, 3.57, 3.57, 3.57}, {-2.00, -2.00, -1.10, -1.10, 0.00, 0.00, 1.10, 1.10, 2.00, 2.00, 3.57, 3.57, 3.57}, {-0.25, -0.25, 0.00, 0.00, 1.10, 1.10, 2.44, 2.44, 3.57, 3.57, 3.57, 3.57, 4.80}, {-0.25, -0.25, 0.00, 0.00, 1.10, 1.10, 2.44, 2.44, 3.57, 3.57, 3.57, 3.57, 4.80}, { 0.00, 0.00, 0.25, 0.25, 2.00, 2.00, 3.57, 3.57, 3.86, 3.86, 4.80, 4.80, 4.80}, }; /* --------------------------------------------------------------------------- */ static double tab_fuzzy_qp_query[13][13]; #if ENABLE_AUTO_INIT_QP /* --------------------------------------------------------------------------- * table for getting initial qp via gpp */ static const double tab_qp_gpp[3][3] = { {5.656359783, 1.029364114, 0.120057248}, {6.520734830, 1.191140657, 0.089733000}, {5.494096438, 0.954657540, 0.111765010} }; #endif #if ENABLE_AUTO_INIT_QP /* --------------------------------------------------------------------------- * compute the gradient per pixel */ static double cal_frame_gradient(xavs2_frame_t *frm) { double grad_per_pixel = 0; // gradient per 
pixel pel_t *src = frm->planes[IMG_Y];// pointer to luma component int width = frm->i_width[IMG_Y]; int height = frm->i_lines[IMG_Y]; int stride = frm->i_stride[IMG_Y]; int size = width * height; int i, j; width--; height--; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { int dx = src[j] - src[j + 1]; int dy = src[j] - src[j + stride]; if (dx || dy) { grad_per_pixel += sqrt((double)(dx * dx + dy * dy)); } } src += stride; } return grad_per_pixel / size; } #endif /* --------------------------------------------------------------------------- */ static void init_fuzzy_controller(double f_scale_factor) { int i, j; for (i = 0; i < 13; i++) { for (j = 0; j < 13; j++) { tab_fuzzy_qp_query[i][j] = tab_fuzzy_initial[i][j] * f_scale_factor; } } } /* --------------------------------------------------------------------------- */ static double fuzzy_get_delta_qp(double f_actual_val, double f_delta_val, double max_a, double min_a, double max_b, double min_b) { double dFuzAct = (12.0 / (max_a - min_a)) * (f_actual_val - (max_a + min_a) / 2.0); double dFuzDel = (12.0 / (max_b - min_b)) * (f_delta_val - (max_b + min_b) / 2.0); int iFuzAct, iFuzDel; dFuzAct = XAVS2_CLIP3F(-6.0, 6.0, dFuzAct); dFuzDel = XAVS2_CLIP3F(-6.0, 6.0, dFuzDel); iFuzAct = (int)((dFuzAct < 0 ? floor(dFuzAct + 0.5) : ceil(dFuzAct - 0.5)) + 6); iFuzDel = (int)((dFuzDel < 0 ? floor(dFuzDel + 0.5) : ceil(dFuzDel - 0.5)) + 6); return tab_fuzzy_qp_query[iFuzAct][iFuzDel]; } /* --------------------------------------------------------------------------- */ static double rc_calculate_gop_delta_qp(ratectrl_t *rc, int frm_type, int gop_len) { double buf_range; double buf_range_delta; double tmp_bpp; /* get ERROR */ #if RC_AUTO_ADJUST rc->f_buf_error = rc->f_buf_curr - rc->f_target_buf_level; #else rc->f_buf_error = rc->f_buf_curr; #endif if ((rc->i_coded_frames % gop_len == 1) || (rc->i_intra_period == 1)) { rc->f_buf_error_diff = rc->f_buf_error - rc->f_buf_error_prev; rc->f_buf_error_prev = rc->f_buf_error; } /* get BPP */ if (rc->i_intra_period > 1) { if (frm_type == XAVS2_TYPE_I) { tmp_bpp = rc->f_win_bpp; } else { tmp_bpp = rc->f_gop_bpp; } } else { if (rc->i_intra_period == 1) { tmp_bpp = rc->f_intra_bpp; } else /*if (rc->i_intra_period == 0)*/ { tmp_bpp = rc->f_inter_bpp; } } /* get RANGE */ buf_range = rc->i_coded_frames < 2 ? 
(tmp_bpp * 4) : (tmp_bpp / 4); buf_range = XAVS2_MAX(buf_range, 0.0001); if (rc->i_intra_period <= 1 || frm_type == XAVS2_TYPE_I) { buf_range_delta = buf_range * 2; } else { buf_range_delta = buf_range / 2; } return fuzzy_get_delta_qp(rc->f_buf_error, rc->f_buf_error_diff, buf_range, -buf_range, buf_range_delta, -buf_range_delta); } /** * --------------------------------------------------------------------------- * Function : calculate the key frame QP * Parameters : * [in ] : h - handle of the xavs2 video encoder * : frm_idx - frame index * : frm_type - frame type * : force_qp - specified qp for encoding current frame * [out] : none * Return : the key frame QP * --------------------------------------------------------------------------- */ static int rc_calculate_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int force_qp) { ratectrl_t *rc = h->rc; const int max_qp = rc->i_max_qp + (h->param->sample_bit_depth - 8) * 8; const int remain_frames = rc->i_total_frames - rc->i_coded_frames; double qp; assert(h->param->i_rc_method != XAVS2_RC_CQP); /* call before floating point arithmetic */ xavs2_emms(); /* the initial frame QP */ if (h->param->enable_refine_qp && h->param->intra_period_max > 1) { qp = 5.661 * log(h->f_lambda_mode) + 13.131; } else { qp = h->i_qp; } /* Update Frame count when I frame is found */ if (frm_type == XAVS2_TYPE_I && rc->i_coded_frames > (RC_MAX_INT >> 1)) { rc->i_total_frames = RC_MAX_INT; rc->i_coded_frames = 0; } /* size is changed from the 2nd WIN for OPEN GOP */ if (frm_type == XAVS2_TYPE_I && frm_idx > 0 && rc->b_open_gop) { rc->i_win_size = h->i_gop_size * rc->i_intra_period; } #if ENABLE_AUTO_INIT_QP /* compute the initial qp */ if (frm_idx == 0) { double bit = log(1000 * rc->f_target_bpp); double gpp = log(cal_frame_gradient(h->fenc)); int idx = XAVS2_MIN(2, rc->i_intra_period); int max_i_qp = 63 + (h->param->sample_bit_depth - 8) * 8 - 10; qp = (tab_qp_gpp[idx][0] + tab_qp_gpp[idx][1] * gpp - bit) / tab_qp_gpp[idx][2]; qp = XAVS2_CLIP3F(20, max_i_qp, qp); rc->i_base_qp = (int)(qp + 0.5); // reset the QP in encoder parameters } #endif /* compute the delta qp */ if (rc->i_intra_period == 0) { if (frm_idx % h->i_gop_size == 1) { rc->f_delta_qp = rc_calculate_gop_delta_qp(rc, frm_type, h->i_gop_size); } } else if (rc->i_intra_period == 1) { if ((frm_idx % h->i_gop_size == 0) && (frm_idx != 0)) { rc->f_delta_qp = rc_calculate_gop_delta_qp(rc, frm_type, h->i_gop_size); } } else { if ((frm_type == XAVS2_TYPE_I) && (remain_frames <= (2 * rc->i_win_size))) { init_fuzzy_controller(0.50); // enhance adjusting strength of the last WIN } if ((frm_idx % h->i_gop_size == 0) && (frm_idx != 0)) { rc->f_delta_qp = rc_calculate_gop_delta_qp(rc, frm_type, h->i_gop_size); } else if (remain_frames == (rc->i_total_frames - 1) % h->i_gop_size) { rc->f_delta_qp = rc_calculate_gop_delta_qp(rc, frm_type, h->i_gop_size); } } if ((frm_idx && frm_idx % h->i_gop_size == 0) || (rc->i_intra_period == 1)) { if (rc->i_intra_period > 1 && frm_type == XAVS2_TYPE_I) { #if RC_AUTO_ADJUST double remain_gop_num; /* adjust the delta QP according to the final WIN */ if (remain_frames > rc->i_win_size + h->i_gop_size) { remain_gop_num = rc->i_intra_period; } else { remain_gop_num = ceil((double)remain_frames / (double)h->i_gop_size); } if (remain_frames <= rc->i_win_size + h->i_gop_size) { remain_gop_num /= rc->i_intra_period; if (remain_gop_num < 1.0 / 3) { rc->f_delta_qp += 4.2; // as bitrate halve } else if (remain_gop_num < 1.0 / 2) { rc->f_delta_qp += 3.4; } else if (remain_gop_num < 2.0 / 3) { 
rc->f_delta_qp += 2.6; } else if (remain_gop_num < 3.0 / 4) { rc->f_delta_qp += 1.8; } else if (remain_gop_num < 4.0 / 4) { rc->f_delta_qp += 1.0; } } #endif /* calculate the average QP of all KEY frames in last WIN */ qp = (double)rc->i_win_qp / rc->i_win_cnt + rc->f_delta_qp; rc->i_base_qp = (int)(qp + 0.5); // reset the QP in encoder parameters } else { /* handle middle GOPs */ qp += rc->f_delta_qp; rc->i_base_qp += (int)(rc->f_delta_qp + 0.5); // also fix the QP in encoder parameters } qp = XAVS2_CLIP3F(rc->i_min_qp, max_qp, qp); rc->i_base_qp = XAVS2_CLIP3F(rc->i_min_qp, max_qp, rc->i_base_qp); } if (force_qp != XAVS2_QP_AUTO) { qp = force_qp - 1; } // check the QP if (rc->i_coded_frames > 0 && frm_type != XAVS2_TYPE_B) { qp = XAVS2_CLIP3F(rc->i_last_qp - RC_MAX_DELTA_QP, rc->i_last_qp + RC_MAX_DELTA_QP, qp); } return XAVS2_CLIP3F(rc->i_min_qp, max_qp, (int)(qp + 0.5)); } /* --------------------------------------------------------------------------- */ #if RC_LCU_LEVEL static void Init_LCURateControl(ratectrl_t *rc, int NumUnitsLCU) { rc->rc_lcu.LCUTBL = rc->f_target_buf_level; rc->rc_lcu.LCUBuffer = rc->f_buf_curr; rc->rc_lcu.LCUTBLdelta = rc->f_delta_buf_level / NumUnitsLCU; rc->rc_lcu.LCUBufferError = 0; rc->rc_lcu.LCUBufferDifError = 0; rc->rc_lcu.LCUPreBufferError = 0; } /* --------------------------------------------------------------------------- */ static int CalculateLCUDeltaQP(ratectrl_t *rc) { if (rc->i_intra_period <= 1) { // lcu level RC does not support RA now. double belta = 0.12; double tmp_bpp = rc->f_target_bpp; double buf_range = tmp_bpp * belta * 2; double buf_range_delta; buf_range = buf_range < 0.0001 ? 0.0001 : buf_range; buf_range_delta = buf_range * 2; return fuzzy_get_delta_qp(rc->rc_lcu.LCUBufferError, rc->rc_lcu.LCUBufferDifError, buf_range, -buf_range, buf_range_delta, -buf_range_delta); } return 0; } /* --------------------------------------------------------------------------- */ static void UpdataLCURateControl(ratectrl_t *rc, int qp, double lambda, int bits, int NumLCU) { rc->rc_lcu.PreBpp = rc->rc_lcu.CurBpp; rc->rc_lcu.CurBpp = ((double)bits) / rc->i_frame_size; rc->rc_lcu.LCUTBL -= rc->rc_lcu.LCUTBLdelta; rc->rc_lcu.LCUBuffer = rc->rc_lcu.LCUBuffer + rc->rc_lcu.CurBpp - rc->f_target_bpp / NumLCU; rc->rc_lcu.LCUBufferError = rc->rc_lcu.LCUBuffer - rc->rc_lcu.LCUTBL; rc->rc_lcu.LCUBufferDifError = rc->rc_lcu.LCUBufferError - rc->rc_lcu.LCUPreBufferError; rc->rc_lcu.LCUPreBufferError = rc->rc_lcu.LCUBufferError; rc->rc_lcu.LCUPreLCUQP = rc->rc_lcu.LCUCurLCUQP; rc->rc_lcu.LCUCurLCUQP = qp; rc->rc_lcu.LCUCurLambda = lambda; } #endif /** * =========================================================================== * interface function defines * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : get buffer size for rate control module * Parameters : * [in ] : param - handle of the xavs2 encoder * [out] : none * Return : return > 0 on success, 0/-1 on failure * --------------------------------------------------------------------------- */ int xavs2_rc_get_buffer_size(xavs2_param_t *param) { UNUSED_PARAMETER(param); return sizeof(ratectrl_t); } /** * --------------------------------------------------------------------------- * Function : create and init the rate control module * Parameters : * [in ] : h - handle of the xavs2 encoder * [out] : none * Return : return 0 on success, -1 on failure * 
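 * Note       : (illustrative arithmetic) the per-frame budget computed below is
 *            : f_target_bpp = target_bitrate / (frame_rate * width * height);
 *            : for example, a 1920x1080 sequence at 25 fps with a 2,000,000 bps
 *            : target gives 2000000 / (25 * 1920 * 1080), about 0.0386 bits per pixel.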
--------------------------------------------------------------------------- */ int xavs2_rc_init(ratectrl_t *rc, xavs2_param_t *param) { /* clear memory for rate control handle */ memset(rc, 0, sizeof(ratectrl_t)); if (param->i_rc_method == XAVS2_RC_CBR_SCU && param->intra_period_max > 1) { param->i_rc_method = XAVS2_RC_CBR_FRM; xavs2_log(NULL, XAVS2_LOG_WARNING, "LCU Rate Control does not support RA. Using Frame RC. \n"); } // init rc->i_total_frames = param->num_frames == 0 ? RC_MAX_INT : param->num_frames; rc->i_total_frames = XAVS2_MIN(RC_MAX_INT, rc->i_total_frames); rc->i_coded_frames = 0; rc->i_intra_period = param->intra_period_max; rc->i_frame_size = param->org_width * param->org_height; rc->b_open_gop = param->b_open_gop; rc->f_delta_qp = 0.0; rc->i_base_qp = param->i_initial_qp; rc->i_last_qp = 0; rc->i_min_qp = param->i_min_qp; rc->i_max_qp = param->i_max_qp; rc->i_gop_flag = -1024; rc->i_win_cnt = 0; rc->i_win_qp = 0; rc->f_win_bpp = 0.0; rc->f_gop_bpp = 0.0; rc->f_target_bpp = param->i_target_bitrate / (param->frame_rate * rc->i_frame_size); rc->f_intra_bpp = 0.0; rc->f_inter_bpp = 0.0; rc->f_buf_curr = 0.0; rc->f_buf_error = 0.0; rc->f_buf_error_diff = 0.0; rc->f_buf_error_prev = 0.0; #if RC_AUTO_ADJUST rc->f_first_buf_level = 0.0; rc->f_target_buf_level = 0.0; rc->f_delta_buf_level = 0.0; #endif // set size of WIN (intra period) rc->i_win_size = param->i_gop_size * (rc->i_intra_period - 1) + 1; // init table of fuzzy controller if (rc->i_intra_period == 1) { init_fuzzy_controller(0.85); } else { init_fuzzy_controller(0.75); } if (xavs2_thread_mutex_init(&rc->rc_mutex, NULL)) { return -1; } return 0; } /** * --------------------------------------------------------------------------- * Function : get base qp of the encoder * Parameters : * [in ] : h - handle of the xavs2 video encoder * [out] : none * Return : the base QP of the encoder * --------------------------------------------------------------------------- */ int xavs2_rc_get_base_qp(xavs2_t *h) { return h->rc->i_base_qp; // return the base qp directly } /** * --------------------------------------------------------------------------- * Function : get frame qp * Parameters : * [in ] : h - handle of the xavs2 video encoder * : frm_idx - frame index * : frm_type - frame type * : force_qp - QP forced by the caller, or XAVS2_QP_AUTO * [out] : none * Return : QP used to encode the current frame * --------------------------------------------------------------------------- */ int xavs2_rc_get_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int force_qp) { /* get QP for current frame */ if (h->param->i_rc_method != XAVS2_RC_CQP && frm_type != XAVS2_TYPE_B) { int i_qp; xavs2_thread_mutex_lock(&h->rc->rc_mutex); i_qp = rc_calculate_frame_qp(h, frm_idx, frm_type, force_qp); xavs2_thread_mutex_unlock(&h->rc->rc_mutex); return i_qp; } else { return h->i_qp; // return the old value directly } } /** * --------------------------------------------------------------------------- * Function : get qp for one lcu * Parameters : * [in ] : h - handle of the xavs2 video encoder * : frm_idx - frame index * : qp - basic QP of the LCU to be encoded * [out] : none * Return : adjusted qp of the LCU * --------------------------------------------------------------------------- */ int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp) { UNUSED_PARAMETER(h); UNUSED_PARAMETER(frm_idx); //if (h->param->i_rc_method == XAVS2_RC_CBR_SCU && img->current_mb_nr == 0) { // Init_LCURateControl(rc, num_of_orgMB); //} #if RC_LCU_LEVEL if (h->param->i_rc_method == XAVS2_RC_CBR_SCU) { ratectrl_t *rc = h->rc; double lambda_mode = 0.5; int current_mb_nr = 0; /* FIX: current LCU index */ if
(current_mb_nr == 0) { rc->SumMBQP = rc->NumMB = rc->LCUbaseQP = 0; } rc->RcMBQP = rc->LCUbaseQP = qp; if (rc->i_intra_period == 1) { if (current_mb_nr == 0) { rc->RcMBQP = qp; } else { rc->rc_lcu.LCUdeltaQP = CalculateLCUDeltaQP(rc); rc->RcMBQP = qp + rc->rc_lcu.LCUdeltaQP; rc->RcMBQP = XAVS2_MAX(qp - 3, XAVS2_MIN(rc->RcMBQP, qp + 3)); } } if (rc->i_intra_period == 0) { if (frm_idx == 0) { rc->RcMBQP = qp; } else { rc->rc_lcu.LCUdeltaQP = CalculateLCUDeltaQP(rc); rc->RcMBQP = qp + rc->rc_lcu.LCUdeltaQP; rc->RcMBQP = XAVS2_MAX(qp - 5, XAVS2_MIN(rc->RcMBQP, qp + 5)); } } if (rc->i_intra_period > 1) { if (frm_idx == 0) { rc->RcMBQP = qp; } else { rc->rc_lcu.LCUdeltaQP = CalculateLCUDeltaQP(rc); rc->RcMBQP = qp + rc->rc_lcu.LCUdeltaQP; rc->RcMBQP = XAVS2_MAX(qp - 5, XAVS2_MIN(rc->RcMBQP, qp + 5)); } } lambda_mode *= pow(2, (rc->RcMBQP - qp) / 4.0); rc->RcMBQP = XAVS2_MAX(rc->i_min_qp, XAVS2_MIN(rc->RcMBQP, rc->i_max_qp + (h->param->sample_bit_depth - 8) * 8)); rc->SumMBQP += rc->RcMBQP; rc->NumMB++; } #endif return qp; } /** * --------------------------------------------------------------------------- * Function : save stats and update rate control state after encoding a LCU block * Parameters : * [in ] : h - handle of the xavs2 video encoder * : frm_idx - frame index * : qp - QP of the encoded LCU * : bits - number of bits of the encoded LCU * Return : none * --------------------------------------------------------------------------- */ void xavs2_rc_update_after_lcu_coded(xavs2_t *h, int frm_idx, int qp) { UNUSED_PARAMETER(h); UNUSED_PARAMETER(frm_idx); UNUSED_PARAMETER(qp); #if RC_LCU_LEVEL if (h->param->i_rc_method == XAVS2_RC_CBR_SCU) { ratectrl_t *rc = h->rc; int LCUbits; if (img->current_mb_nr == 0) { rc->rc_lcu.PreBits = 0; } rc->rc_lcu.CurBits = currBitStream->byte_pos * 8; LCUbits = rc->rc_lcu.CurBits - rc->rc_lcu.PreBits; rc->rc_lcu.PreBits = rc->rc_lcu.CurBits; UpdataLCURateControl(rc, rc->RcMBQP, lambda_mode, LCUbits, numLCUInPicWidth * numLCUInPicHeight); } #endif } /** * --------------------------------------------------------------------------- * Function : save stats and update rate control state after encoding a frame * Parameters : * [in ] : h - handle of the xavs2 video encoder * : frm_bits - number of bits of the encoded frame * : frm_qp - average QP of the encoded frame * : frm_type - frame type * : frm_idx - frame index * Return : none * --------------------------------------------------------------------------- */ void xavs2_rc_update_after_frame_coded(xavs2_t *h, int frm_bits, int frm_qp, int frm_type, int frm_idx) { ratectrl_t *rc = h->rc; double frm_bpp = (double)frm_bits / rc->i_frame_size; // bits per pixel if (h->param->i_rc_method == XAVS2_RC_CQP) { return; /* no need to update */ } xavs2_thread_mutex_lock(&rc->rc_mutex); // lock #if RC_LCU_LEVEL if (h->param->i_rc_method == XAVS2_RC_CBR_SCU) { frm_qp = (int)((0.5 + rc->SumMBQP) / rc->NumMB); } #endif /* update */ rc->i_coded_frames++; /* sum up number of encoded frames */ rc->f_buf_curr += frm_bpp - rc->f_target_bpp; /* sum up buffer ERROR */ if (frm_type != XAVS2_TYPE_B) { if (frm_type == XAVS2_TYPE_I) { /* reset for the WIN */ rc->f_intra_bpp = frm_bpp; rc->i_win_qp = frm_qp; rc->f_win_bpp = frm_bpp; rc->i_win_cnt = 1; } else { /* sum up in the WIN */ rc->f_inter_bpp = frm_bpp; rc->i_win_qp += frm_qp; rc->f_win_bpp += frm_bpp; rc->i_win_cnt++; } rc->i_last_qp = frm_qp; rc->f_gop_bpp = frm_bpp; /* reset for a GOP */ } else { rc->f_gop_bpp += frm_bpp; /* sum up in a GOP */ } #if RC_AUTO_ADJUST /* adjust */ if 
(rc->i_intra_period == 1) { rc->f_target_buf_level = rc->f_delta_buf_level = 0.0; } else if (rc->i_intra_period == 0) { if (frm_type == XAVS2_TYPE_I) { rc->f_target_buf_level = rc->f_buf_curr; rc->f_delta_buf_level = rc->f_target_buf_level / (rc->i_total_frames + XAVS2_MAX(0, XAVS2_MIN(150, (rc->i_total_frames - 150) / 3))); } else { rc->f_target_buf_level = rc->f_target_buf_level - rc->f_delta_buf_level; } } else if (rc->i_intra_period == 2 || rc->i_intra_period == 3) { if (frm_type == XAVS2_TYPE_I && rc->i_coded_frames < 2) { rc->f_first_buf_level = rc->f_buf_curr; rc->f_target_buf_level = rc->f_buf_curr; } else { rc->f_target_buf_level = rc->f_first_buf_level * cos(PI / 2 * rc->i_coded_frames / rc->i_total_frames); } } else { const int remain_frames = rc->i_total_frames - rc->i_coded_frames; int LevelLength; if (frm_type == XAVS2_TYPE_I && rc->i_coded_frames < 2) { /* in the first WIN, after encoding the frame I */ LevelLength = XAVS2_MIN(rc->i_win_size - 1, rc->i_total_frames - frm_idx); rc->f_target_buf_level = rc->f_buf_curr; rc->f_delta_buf_level = rc->f_buf_curr / LevelLength; } else if (frm_type == XAVS2_TYPE_I && rc->i_coded_frames >= 2 && (remain_frames > rc->i_win_size + h->i_gop_size)) { /* in the middle WIN, after encoding the frame I */ rc->i_gop_flag = rc->i_coded_frames; /* store the position */ rc->f_target_buf_level = rc->f_delta_buf_level = 0.0; /* not adjust the buffer level */ } else if (frm_type == XAVS2_TYPE_I && rc->i_coded_frames >= 2 && (remain_frames <= rc->i_win_size + h->i_gop_size)) { /* in the final WIN, after encoding the frame I */ rc->f_target_buf_level = rc->f_buf_curr; rc->f_delta_buf_level = rc->f_buf_curr / remain_frames; } else if (rc->i_gop_flag == rc->i_coded_frames - h->i_gop_size) { /* in the middle WIN, after encoding the first GOP */ if (remain_frames <= rc->i_win_size) { LevelLength = remain_frames; } else { LevelLength = (rc->i_intra_period - 1) * h->i_gop_size; } rc->f_target_buf_level = rc->f_buf_curr; rc->f_delta_buf_level = rc->f_buf_curr / LevelLength; rc->i_gop_flag = -1024; } else { rc->f_target_buf_level = rc->f_target_buf_level - rc->f_delta_buf_level; } } #else UNUSED_PARAMETER(frm_idx); #endif xavs2_thread_mutex_unlock(&rc->rc_mutex); // unlock } /** * --------------------------------------------------------------------------- * Function : destroy the rate control * Parameters : * [in ] : rc - handle of the ratecontrol handler * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xavs2_rc_destroy(ratectrl_t *rc) { xavs2_thread_mutex_destroy(&rc->rc_mutex); } xavs2-1.3/source/encoder/ratecontrol.h000066400000000000000000000050441340660520300200460ustar00rootroot00000000000000/* * ratecontrol.h * * Description of this file: * Ratecontrol functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_RATECONTRAL_H #define XAVS2_RATECONTRAL_H #define xavs2_rc_get_buffer_size FPFX(rc_get_buffer_size) int xavs2_rc_get_buffer_size(xavs2_param_t *h); #define xavs2_rc_init FPFX(rc_init) int xavs2_rc_init(ratectrl_t *rc, xavs2_param_t *param); #define xavs2_rc_get_base_qp FPFX(rc_get_base_qp) int xavs2_rc_get_base_qp(xavs2_t *h); #define xavs2_rc_get_frame_qp FPFX(rc_get_frame_qp) int xavs2_rc_get_frame_qp(xavs2_t *h, int frm_idx, int frm_type, int force_qp); #define xavs2_rc_update_after_frame_coded FPFX(rc_update_after_frame_coded) void xavs2_rc_update_after_frame_coded(xavs2_t *h, int frm_bits, int frm_qp, int frm_type, int frm_idx); #if ENABLE_RATE_CONTROL_CU #define xavs2_rc_get_lcu_qp FPFX(rc_get_lcu_qp) int xavs2_rc_get_lcu_qp(xavs2_t *h, int frm_idx, int qp); #define xavs2_rc_update_after_lcu_coded FPFX(rc_update_after_lcu_coded) void xavs2_rc_update_after_lcu_coded(xavs2_t *h, int frm_idx, int qp); #endif // ENABLE_RATE_CONTROL_CU #define xavs2_rc_destroy FPFX(rc_destroy) void xavs2_rc_destroy(ratectrl_t *rc); #endif // XAVS2_RATECONTRAL_H xavs2-1.3/source/encoder/rdo.c000066400000000000000000004556331340660520300163060ustar00rootroot00000000000000/* * rdo.c * * Description of this file: * RDO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "rdo.h" #include "cudata.h" #include "aec.h" #include "common/mc.h" #include "transform.h" #include "block_info.h" #include "wquant.h" #include "me.h" #include "cpu.h" #include "predict.h" #include "ratecontrol.h" #include "rdoq.h" /** * =========================================================================== * local/global variables * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const float SUBCU_COST_RATE[2][4] = { {0.50f, 0.75f, 0.97f, 1.0f}, /* ֡CUCostһ㶼ϴ */ {0.75f, 0.90f, 0.99f, 1.0f}, /* ֡£SkipCostС */ }; static const int tab_pdir_bskip[DS_MAX_NUM] = { PDIR_BID, PDIR_BWD, PDIR_SYM, PDIR_FWD }; /* --------------------------------------------------------------------------- */ static const int8_t NUM_PREDICTION_UNIT[MAX_PRED_MODES] = {// [mode] 1, // 0: 8x8, ---, ---, --- (PRED_SKIP ) 1, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) 2, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) 2, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) 2, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) 2, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) 2, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) 2, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) 1, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) 4, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) 4, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) 4 //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) }; static const cb_t CODING_BLOCK_INFO[MAX_PRED_MODES + 1][4] = {// [mode][block] // x, y, w, h x, y, w, h x, y, w, h x, y, w, h for block 0, 1, 2 and 3 { { {0, 0, 8, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 0: 8x8, ---, ---, --- (PRED_SKIP ) { { {0, 0, 8, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) { { {0, 0, 8, 4} }, { {0, 4, 8, 4} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) { { {0, 0, 4, 8} }, { {4, 0, 4, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) { { {0, 0, 8, 2} }, { {0, 2, 8, 6} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) { { {0, 0, 8, 6} }, { {0, 6, 8, 2} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) { { {0, 0, 2, 8} }, { {2, 0, 6, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) { { {0, 0, 6, 8} }, { {6, 0, 2, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) { { {0, 0, 8, 8} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} }, { {0, 0, 0, 0} } }, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) { { {0, 0, 4, 4} }, { {4, 0, 4, 4} }, { {0, 4, 4, 4} }, { {4, 4, 4, 4} } }, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) { { {0, 0, 8, 2} }, { {0, 2, 8, 2} }, { {0, 4, 8, 2} }, { {0, 6, 8, 2} } }, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) { { {0, 0, 2, 8} }, { {2, 0, 2, 8} }, { {4, 0, 2, 8} }, { {6, 0, 2, 8} } }, //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) { { {0, 0, 4, 4} }, { {4, 0, 4, 4} }, { {0, 4, 4, 4} }, { {4, 4, 4, 4} } }, // X: 4x4, 4x4, 4x4, 4x4 }; static const int8_t TU_SPLIT_TYPE[MAX_PRED_MODES][2] = { // [mode][(NsqtEnable or SdipEnables) and cu_level > B8X8_IN_BIT] // split_type for block non-sdip/nsqt:[0] and sdip/nsqt:[1] { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 0: 8x8, ---, ---, --- (PRED_SKIP ) { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) { TU_SPLIT_CROSS, TU_SPLIT_HOR }, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) { TU_SPLIT_CROSS, 
TU_SPLIT_HOR }, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) { TU_SPLIT_CROSS, TU_SPLIT_HOR }, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) { TU_SPLIT_NON, TU_SPLIT_INVALID }, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) { TU_SPLIT_INVALID, TU_SPLIT_HOR }, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) { TU_SPLIT_INVALID, TU_SPLIT_VER } //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) }; static const int8_t headerbits_skipmode[8] = { 2, 3, 4, 4, 3, 4, 5, 5 };//temporal wsm1 wsm2 wsm3 spatial_direct0 spatial_direct1 spatial_direct2 spatial_direct3 /** * =========================================================================== * local function defines (utilities) * =========================================================================== */ /* --------------------------------------------------------------------------- * CUģʽȷǰCUPUʹС֡仮֣ */ static ALWAYS_INLINE void cu_init_pu_inter(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) { const int shift_bits = i_level - MIN_CU_SIZE_IN_BIT; const int8_t block_num = NUM_PREDICTION_UNIT[i_mode]; int ds_mode = p_cu_info->directskip_mhp_idx; int i; cb_t *p_cb = p_cu_info->cb; // set for each block if (i_mode == PRED_SKIP) { ///! һЩSkip/DirectģʽCU8x8PUֳ4 if (i_level > 3 && (h->i_type == SLICE_TYPE_P || (h->i_type == SLICE_TYPE_F && ds_mode == DS_NONE) || (h->i_type == SLICE_TYPE_B && ds_mode == DS_NONE))) { p_cu_info->num_pu = 4; for (i = 0; i < 4; i++) { p_cb[i].v = CODING_BLOCK_INFO[PRED_I_nx2N + 1][i].v << shift_bits; } } else { p_cu_info->num_pu = 1; memset(p_cu_info->cb, 0, sizeof(p_cu_info->cb)); p_cb[0].v = CODING_BLOCK_INFO[PRED_SKIP][0].v << shift_bits; } } else { p_cu_info->num_pu = block_num; for (i = 0; i < block_num; i++) { p_cb[i].v = CODING_BLOCK_INFO[i_mode][i].v << shift_bits; } } } /* --------------------------------------------------------------------------- * CUģʽȷǰCUPUʹС֡ڻ֣ */ static ALWAYS_INLINE void cu_init_pu_intra(xavs2_t *h, cu_info_t *p_cu_info, int i_level, int i_mode) { const int shift_bits = i_level - MIN_CU_SIZE_IN_BIT; const int8_t block_num = NUM_PREDICTION_UNIT[i_mode]; int i; cb_t *p_cb = p_cu_info->cb; UNUSED_PARAMETER(h); // set for each block p_cu_info->num_pu = block_num; for (i = 0; i < 4; i++) { p_cb[i].v = CODING_BLOCK_INFO[i_mode][i].v << shift_bits; } } /* --------------------------------------------------------------------------- * TU split type when TU split is enabled for current CU */ static ALWAYS_INLINE void cu_set_tu_split_type(xavs2_t *h, cu_info_t *p_cu_info, int transform_split_flag) { int mode = p_cu_info->i_mode; int level = p_cu_info->i_level; int enable_nsqt_sdip = IS_INTRA_MODE(mode) ? h->param->enable_sdip : h->param->enable_nsqt; enable_nsqt_sdip = enable_nsqt_sdip && level > B8X8_IN_BIT; p_cu_info->i_tu_split = transform_split_flag ? 
TU_SPLIT_TYPE[mode][enable_nsqt_sdip] : TU_SPLIT_NON; assert(p_cu_info->i_tu_split != TU_SPLIT_INVALID); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE uint32_t cu_get_valid_modes(xavs2_t *h, int frm_type, int level) { return h->valid_modes[frm_type][level - MIN_CU_SIZE_IN_BIT]; } /* --------------------------------------------------------------------------- */ static INLINE void cu_init(xavs2_t *h, cu_t *p_cu, cu_info_t *best, int i_level) { cu_layer_t *p_layer = cu_get_layer(h, i_level); int i; /* Ping-pong buffer */ p_layer->buf_pred_inter = p_layer->buf_pred_inter_luma[0]; p_layer->buf_pred_inter_best = p_layer->buf_pred_inter_luma[1]; /* init rec and coeff pointer */ p_cu->cu_info.p_rec [0] = p_layer->rec_buf_y [0]; p_cu->cu_info.p_coeff[0] = p_layer->coef_buf_y[0]; p_layer->p_rec_tmp [0] = p_layer->rec_buf_y [1]; p_layer->p_coeff_tmp [0] = p_layer->coef_buf_y[1]; best->p_rec [0] = p_layer->rec_buf_y [2]; best->p_coeff [0] = p_layer->coef_buf_y[2]; p_cu->cu_info.p_rec [1] = p_layer->rec_buf_uv [0][0]; p_cu->cu_info.p_coeff[1] = p_layer->coef_buf_uv[0][0]; p_layer->p_rec_tmp [1] = p_layer->rec_buf_uv [0][1]; p_layer->p_coeff_tmp [1] = p_layer->coef_buf_uv[0][1]; best->p_rec [1] = p_layer->rec_buf_uv [0][2]; best->p_coeff [1] = p_layer->coef_buf_uv[0][2]; p_cu->cu_info.p_rec [2] = p_layer->rec_buf_uv [1][0]; p_cu->cu_info.p_coeff[2] = p_layer->coef_buf_uv[1][0]; p_layer->p_rec_tmp [2] = p_layer->rec_buf_uv [1][1]; p_layer->p_coeff_tmp [2] = p_layer->coef_buf_uv[1][1]; best->p_rec [2] = p_layer->rec_buf_uv [1][2]; best->p_coeff [2] = p_layer->coef_buf_uv[1][2]; /* init basic properties */ p_cu->cu_info.i_cbp = 0; #if ENABLE_RATE_CONTROL_CU /* set qp needed in loop filter (even if constant QP is used) */ p_cu->cu_info.i_cu_qp = h->i_qp; if (h->param->i_rc_method == XAVS2_RC_CBR_SCU) { int i_left_cu_qp; if (p_cu->i_pix_x > 0) { i_left_cu_qp = h->cu_info[p_cu->i_scu_xy - 1].i_cu_qp; } else { i_left_cu_qp = h->i_qp; } p_cu->cu_info.i_delta_qp = p_cu->cu_info.i_cu_qp - i_left_cu_qp; } else { p_cu->cu_info.i_delta_qp = 0; } #endif /* ref_idx_1st[], ref_idx_2nd[] ڴ */ memset(p_cu->cu_info.ref_idx_1st, INVALID_REF, sizeof(p_cu->cu_info.ref_idx_1st) + sizeof(p_cu->cu_info.ref_idx_2nd)); /* init position for 4 sub-CUs */ if (i_level > B8X8_IN_BIT) { for (i = 0; i < 4; i++) { cu_t *p_sub_cu = p_cu->sub_cu[i]; p_sub_cu->i_pix_x = p_cu->i_pix_x + ((i & 1) << (i_level - 1)); p_sub_cu->i_pix_y = p_cu->i_pix_y + ((i >> 1) << (i_level - 1)); p_sub_cu->cu_info.i_scu_x = p_sub_cu->i_pix_x >> MIN_CU_SIZE_IN_BIT; p_sub_cu->cu_info.i_scu_y = p_sub_cu->i_pix_y >> MIN_CU_SIZE_IN_BIT; p_sub_cu->i_scu_xy = p_sub_cu->cu_info.i_scu_y * h->i_width_in_mincu + p_sub_cu->cu_info.i_scu_x; } } /* set neighbor CUs */ check_neighbor_cu_avail(h, p_cu, p_cu->cu_info.i_scu_x, p_cu->cu_info.i_scu_y, p_cu->i_scu_xy); p_cu->b_cbp_direct = 0; } /* --------------------------------------------------------------------------- * copy information of CU */ static ALWAYS_INLINE void cu_copy_info(cu_info_t *p_dst, const cu_info_t *p_src) { const int num_bytes = sizeof(cu_info_t) - (int)((uint8_t *)&p_dst->i_level - (uint8_t *)p_dst); memcpy(&p_dst->i_level, &p_src->i_level, num_bytes); } /* --------------------------------------------------------------------------- * store cu parameters to best */ static void cu_store_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best) { int mode = p_cu->cu_info.i_mode; cu_mode_t *p_mode = cu_get_layer_mode(h, p_cu->cu_info.i_level); // store best 
mode cu_copy_info(best, &p_cu->cu_info); /* --- reconstructed blocks ---- */ XAVS2_SWAP_PTR(best->p_rec[0], p_cu->cu_info.p_rec[0]); XAVS2_SWAP_PTR(best->p_rec[1], p_cu->cu_info.p_rec[1]); XAVS2_SWAP_PTR(best->p_rec[2], p_cu->cu_info.p_rec[2]); /* ---- residual (coefficients) ---- */ XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]); XAVS2_SWAP_PTR(best->p_coeff[1], p_cu->cu_info.p_coeff[1]); XAVS2_SWAP_PTR(best->p_coeff[2], p_cu->cu_info.p_coeff[2]); /* ---- prediction information ---- */ if (!IS_INTRA_MODE(mode)) { memcpy(&p_mode->best_mc, &p_cu->mc, sizeof(p_cu->mc)); } } /* --------------------------------------------------------------------------- * sets motion vectors and reference indexes for an CU */ static void cu_save_mvs_refs(xavs2_t *h, cu_info_t *p_cu_info) { int8_t *p_dirpred = h->dir_pred; int8_t *p_ref_1st = h->fwd_1st_ref; int8_t *p_ref_2nd = h->bwd_2nd_ref; mv_t *p_mv_1st = h->fwd_1st_mv; mv_t *p_mv_2nd = h->bwd_2nd_mv; int bx_4x4_cu = p_cu_info->i_scu_x << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); int by_4x4_cu = p_cu_info->i_scu_y << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); int w_in_4x4 = h->i_width_in_minpu; cu_mode_t *p_cu_mode = cu_get_layer_mode(h, p_cu_info->i_level); cu_mc_param_t *p_mc = &p_cu_mode->best_mc; int width, height; int k, by, bx; int r, c; for (k = 0; k < p_cu_info->num_pu; k++) { int8_t i_dir_pred = (int8_t)p_cu_info->b8pdir[k]; int8_t ref_1st = (int8_t)p_cu_info->ref_idx_1st[k]; int8_t ref_2nd = (int8_t)p_cu_info->ref_idx_2nd[k]; mv_t mv_1st = p_mc->mv[k][0]; mv_t mv_2nd = p_mc->mv[k][1]; cb_t cur_cb = p_cu_info->cb[k]; cur_cb.v >>= 2; bx = cur_cb.x; by = cur_cb.y; width = cur_cb.w; height = cur_cb.h; bx += bx_4x4_cu; by += by_4x4_cu; for (r = 0; r < height; r++) { int offset = (by + r) * w_in_4x4 + bx; for (c = 0; c < width; c++) { p_dirpred[offset + c] = i_dir_pred; p_mv_1st [offset + c] = mv_1st; p_mv_2nd [offset + c] = mv_2nd; p_ref_1st[offset + c] = ref_1st; p_ref_2nd[offset + c] = ref_2nd; } } } } /* --------------------------------------------------------------------------- * set stored cu parameters */ static INLINE void cu_copy_stored_parameters(xavs2_t *h, cu_t *p_cu, cu_info_t *best) { int mode = best->i_mode; int w_in_4x4 = h->i_width_in_minpu; int scu_xy = p_cu->i_scu_xy; int b4x4_x = p_cu->i_pix_x >> MIN_PU_SIZE_IN_BIT; int b4x4_y = p_cu->i_pix_y >> MIN_PU_SIZE_IN_BIT; int pix_x = p_cu->i_pos_x; int pix_y = p_cu->i_pos_y; int pix_cx = pix_x >> 1; int pix_cy = pix_y >> 1; int blocksize = p_cu->i_size; int ip_stride = h->i_width_in_minpu + 16; int part_idx_c = PART_INDEX(blocksize >> 1, blocksize >> 1); int8_t *p_intramode = h->ipredmode + (pix_y >> MIN_PU_SIZE_IN_BIT) * ip_stride + b4x4_x; const int size_in_spu = (blocksize >> MIN_PU_SIZE_IN_BIT); const int size_in_scu = (blocksize >> MIN_CU_SIZE_IN_BIT); int i, j; cu_copy_info(&p_cu->cu_info, best); //===== reconstruction values ===== g_funcs.pixf.copy_pp[PART_INDEX(blocksize, blocksize)](h->lcu.p_fdec[0] + pix_y * FDEC_STRIDE + pix_x, FDEC_STRIDE, best->p_rec[0], FREC_STRIDE); g_funcs.pixf.copy_ss[PART_INDEX(blocksize, blocksize)](h->lcu.lcu_coeff[0] + (p_cu->idx_zorder << 6), blocksize, best->p_coeff[0], blocksize); g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[1] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, best->p_rec[1], FREC_CSTRIDE / 2); g_funcs.pixf.copy_pp[part_idx_c](h->lcu.p_fdec[2] + pix_cy * FDEC_STRIDE + pix_cx, FDEC_STRIDE, best->p_rec[2], FREC_CSTRIDE / 2); g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[1] + (p_cu->idx_zorder << 4), blocksize >> 1, 
best->p_coeff[1], blocksize >> 1); g_funcs.pixf.copy_ss[part_idx_c](h->lcu.lcu_coeff[2] + (p_cu->idx_zorder << 4), blocksize >> 1, best->p_coeff[2], blocksize >> 1); //=============== cbp and mode =============== for (j = 0; j < size_in_scu; j++) { cu_info_t *p_cu_info = &h->cu_info[j * h->i_width_in_mincu + scu_xy]; // save data to cu_info for (i = size_in_scu; i != 0; i--) { cu_copy_info(p_cu_info++, best); } } //=============== intra pred mode =============== if (IS_INTRA_MODE(mode)) { int n_size4 = size_in_spu >> 2; int k; int8_t intra_pred_mode; switch (mode) { case PRED_I_2Nxn: for (i = 0; i < 4; i++) { for (j = i * n_size4; j < (i + 1) * n_size4; j++) { g_funcs.fast_memset(p_intramode, p_cu->cu_info.real_intra_modes[i], size_in_spu * sizeof(int8_t)); p_intramode += ip_stride; } } break; case PRED_I_nx2N: for (j = 0; j < size_in_spu; j++) { for (i = 0; i < 4; i++) { k = i * n_size4; g_funcs.fast_memset(p_intramode + k, p_cu->cu_info.real_intra_modes[i], n_size4 * sizeof(int8_t)); } p_intramode += ip_stride; } break; case PRED_I_NxN: n_size4 = size_in_spu >> 1; for (j = 0; j < n_size4; j++) { for (i = 0; i < 2; i++) { k = i * n_size4; g_funcs.fast_memset(p_intramode + k, p_cu->cu_info.real_intra_modes[i], n_size4 * sizeof(int8_t)); } p_intramode += ip_stride; } for (j = n_size4; j < size_in_spu; j++) { for (i = 0; i < 2; i++) { k = i * n_size4; g_funcs.fast_memset(p_intramode + k, p_cu->cu_info.real_intra_modes[i + 2], n_size4 * sizeof(int8_t)); } p_intramode += ip_stride; } break; default: // PRED_2Nx2N intra_pred_mode = p_cu->cu_info.real_intra_modes[0]; for (j = size_in_spu - 1; j != 0; j--) { p_intramode[size_in_spu - 1] = intra_pred_mode; p_intramode += ip_stride; } g_funcs.fast_memset(p_intramode, intra_pred_mode, size_in_spu * sizeof(int8_t)); break; } } else if (h->i_type != SLICE_TYPE_I && h->fenc->b_enable_intra) { for (j = size_in_spu - 1; j != 0; j--) { p_intramode[size_in_spu - 1] = -1; p_intramode += ip_stride; } g_funcs.fast_memset(p_intramode, -1, size_in_spu * sizeof(int8_t)); } //============== inter prediction information ========================= if (h->i_type != SLICE_TYPE_I) { if (IS_INTER_MODE(p_cu->cu_info.i_mode)) { cu_save_mvs_refs(h, &p_cu->cu_info); // store mv } else { int8_t *p_dirpred = h->dir_pred + b4x4_y * w_in_4x4 + b4x4_x; int8_t *p_ref_1st = h->fwd_1st_ref + b4x4_y * w_in_4x4 + b4x4_x; int8_t *p_ref_2nd = h->bwd_2nd_ref + b4x4_y * w_in_4x4 + b4x4_x; int size_b4 = blocksize >> MIN_PU_SIZE_IN_BIT; for (i = size_b4; i != 0; i--) { for (j = 0; j < size_b4; j++) { p_ref_1st[j] = INVALID_REF; p_ref_2nd[j] = INVALID_REF; p_dirpred[j] = PDIR_INVALID; } p_dirpred += w_in_4x4; p_ref_1st += w_in_4x4; p_ref_2nd += w_in_4x4; } } } } /* --------------------------------------------------------------------------- * get spatial neighboring MV */ static ALWAYS_INLINE void cu_get_neighbor_spatial(xavs2_t *h, int cur_slice_idx, neighbor_inter_t *p_neighbor, int x4, int y4) { int pos = y4 * h->i_width_in_minpu + x4; int y_outside_pic = y4 < 0 || y4 >= h->i_height_in_minpu; int x_outside_pic = x4 < 0 || x4 >= h->i_width_in_minpu; // scu_xy = XAVS2_MIN(h->i_width_in_mincu * h->i_height_in_mincu - 1, XAVS2_MAX(0, scu_xy)); if (y_outside_pic || x_outside_pic || cu_get_slice_index(h, x4 >> 1, y4 >> 1) != cur_slice_idx) { p_neighbor->is_available = 0; p_neighbor->i_dir_pred = PDIR_INVALID; p_neighbor->ref_idx[0] = INVALID_REF; p_neighbor->ref_idx[1] = INVALID_REF; p_neighbor->mv[0].v = 0; p_neighbor->mv[1].v = 0; } else { p_neighbor->is_available = 1; p_neighbor->i_dir_pred 
= h->dir_pred[pos]; p_neighbor->ref_idx[0] = h->fwd_1st_ref[pos]; p_neighbor->ref_idx[1] = h->bwd_2nd_ref[pos]; p_neighbor->mv[0] = h->fwd_1st_mv[pos]; p_neighbor->mv[1] = h->bwd_2nd_mv[pos]; } } /* --------------------------------------------------------------------------- * get temporal MV predictor */ static ALWAYS_INLINE void cu_get_neighbor_temporal(xavs2_t *h, neighbor_inter_t *p_neighbor, int x4, int y4) { int w_in_16x16 = (h->i_width_in_minpu + 3) >> 2; int pos = (y4 >> 2) * w_in_16x16 + (x4 >> 2); p_neighbor->is_available = 1; p_neighbor->i_dir_pred = PDIR_FWD; p_neighbor->ref_idx[0] = h->fref[0]->pu_ref[pos]; p_neighbor->mv[0] = h->fref[0]->pu_mv[pos]; p_neighbor->ref_idx[1] = INVALID_REF; p_neighbor->mv[1].v = 0; } /* --------------------------------------------------------------------------- * get neighboring MVs for MVP */ void cu_get_neighbors(xavs2_t *h, cu_t *p_cu, cb_t *p_cb) { neighbor_inter_t *neighbors = cu_get_layer(h, p_cu->cu_info.i_level)->neighbor_inter; int cur_slice_idx = cu_get_slice_index(h, p_cu->i_pix_x >> MIN_CU_SIZE_IN_BIT, p_cu->i_pix_y >> MIN_CU_SIZE_IN_BIT); int bx_4x4 = p_cu->i_pix_x >> MIN_PU_SIZE_IN_BIT; int by_4x4 = p_cu->i_pix_y >> MIN_PU_SIZE_IN_BIT; int xx0 = (p_cb->x >> MIN_PU_SIZE_IN_BIT) + bx_4x4; int yy0 = (p_cb->y >> MIN_PU_SIZE_IN_BIT) + by_4x4; int xx1 = (p_cb->w >> MIN_PU_SIZE_IN_BIT) + xx0 - 1; int yy1 = (p_cb->h >> MIN_PU_SIZE_IN_BIT) + yy0 - 1; /* 1. check whether the top-right 4x4 block is reconstructed */ int x_TR_4x4_in_lcu = xx1 - (h->lcu.i_scu_x << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT)); int y_TR_4x4_in_lcu = yy0 - (h->lcu.i_scu_y << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT)); int b_available_TR = h->tab_avail_TR[(y_TR_4x4_in_lcu << (h->i_lcu_level - B4X4_IN_BIT)) + x_TR_4x4_in_lcu]; /* 2. get neighboring blocks */ /* */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPLEFT ], xx0 - 1, yy0 - 1); /* ڵPUϢ */ if (IS_VER_PU_PART(p_cu->cu_info.i_mode) && p_cb->x != 0) { // CUֱΪPUҵǰPUΪұһ neighbor_inter_t *p_neighbor = neighbors + BLK_LEFT; p_neighbor->is_available = 1; // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0); p_neighbor->i_dir_pred = p_cu->cu_info.b8pdir[0]; p_neighbor->ref_idx[0] = p_cu->cu_info.ref_idx_1st[0]; p_neighbor->ref_idx[1] = p_cu->cu_info.ref_idx_2nd[0]; p_neighbor->mv[0] = p_cu->mc.mv[0][0]; p_neighbor->mv[1] = p_cu->mc.mv[0][1]; memcpy(&neighbors[BLK_LEFT2], p_neighbor, sizeof(neighbor_inter_t)); } else { cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT2], xx0 - 1, yy1); } /* ڵPUϢ */ if (IS_HOR_PU_PART(p_cu->cu_info.i_mode) && p_cb->y != 0) { // CUˮƽΪPUҵǰPUΪ±һ neighbor_inter_t *p_neighbor = neighbors + BLK_TOP; p_neighbor->is_available = 1; // cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT], xx0 - 1, yy0); p_neighbor->i_dir_pred = p_cu->cu_info.b8pdir[0]; p_neighbor->ref_idx[0] = p_cu->cu_info.ref_idx_1st[0]; p_neighbor->ref_idx[1] = p_cu->cu_info.ref_idx_2nd[0]; p_neighbor->mv[0] = p_cu->mc.mv[0][0]; p_neighbor->mv[1] = p_cu->mc.mv[0][1]; memcpy(&neighbors[BLK_TOP2], p_neighbor, sizeof(neighbor_inter_t)); } else { cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP], xx0, yy0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP2], xx1, yy0 - 1); } /* */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPRIGHT], b_available_TR ? 
xx1 + 1 : -1, yy0 - 1); cu_get_neighbor_temporal(h, &neighbors[BLK_COL], xx0, yy0); } /* --------------------------------------------------------------------------- * return: number of reference frames */ static ALWAYS_INLINE int cu_get_mvs_for_mc(xavs2_t *h, cu_t *p_cu, int pu_idx, mv_t *p_mv_1st, mv_t *p_mv_2nd, int *p_ref_idx1, int *p_ref_idx2) { int num_ref; // number of reference frames int dmh_mode = p_cu->cu_info.dmh_mode; int ref_1st = p_cu->cu_info.ref_idx_1st[pu_idx]; // һǰB֡Ԥ⣩˶ʸ int ref_2nd = p_cu->cu_info.ref_idx_2nd[pu_idx]; // ڶB֡˫ĺ mv_t mv_1st, mv_2nd; // һǰB֡Ԥ⣩͵ڶ˶ʸ if (h->i_type != SLICE_TYPE_B) { num_ref = (ref_1st != INVALID_REF) + (ref_2nd != INVALID_REF); mv_1st = p_cu->mc.mv[pu_idx][0]; mv_2nd = p_cu->mc.mv[pu_idx][1]; if (dmh_mode > 0) { num_ref = 2; ref_2nd = ref_1st; mv_2nd = mv_1st; mv_1st.x -= tab_dmh_pos[dmh_mode][0]; mv_1st.y -= tab_dmh_pos[dmh_mode][1]; mv_2nd.x += tab_dmh_pos[dmh_mode][0]; mv_2nd.y += tab_dmh_pos[dmh_mode][1]; } } else { num_ref = (ref_1st != INVALID_REF) + (ref_2nd != INVALID_REF); if (ref_1st == INVALID_REF) { ref_1st = B_BWD; ref_2nd = INVALID_REF; mv_1st = p_cu->mc.mv[pu_idx][1]; mv_2nd.v = 0; } else { mv_1st = p_cu->mc.mv[pu_idx][0]; mv_2nd = p_cu->mc.mv[pu_idx][1]; ref_1st = B_FWD; ref_2nd = B_BWD; } } *p_mv_1st = mv_1st; *p_mv_2nd = mv_2nd; *p_ref_idx1 = ref_1st; *p_ref_idx2 = ref_2nd; return num_ref; } /* --------------------------------------------------------------------------- * forward quantization */ static INLINE int tu_quant_forward(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *p_coeff, int i_level, int bsx, int bsy, int qp, int b_intra, int b_luma, int intra_mode) { /* ((1 << x) * 5) / 31 */ static const int tab_quant_fwd_add[] = { 0, 0, 0, 1, 2, 5, 10, 20, 41, 82, 165, 330, 660, 1321, 2642, 5285, 10570, 21140, 42281, 84562, 169125, 338250, 676500, 1353001, }; const int shift = 15 + LIMIT_BIT - (h->param->sample_bit_depth + 1) - i_level; const int add = tab_quant_fwd_add[shift + b_intra]; if (h->lcu.b_enable_rdoq) { if ((IS_ALG_ENABLE(OPT_CODE_OPTIMZATION) && b_luma) || (IS_ALG_ENABLE(OPT_RDOQ_AZPC))) { const int i_coef = bsx * bsy; const int th_RDOQ = (int)(((1 << shift) - add) / (double)(tab_Q_TAB[qp])); //ljr int i; for (i = 0; i < i_coef; i++) { if (XAVS2_ABS(p_coeff[i]) >= th_RDOQ) { break; } } if (i_coef == i) { p_coeff[0] = 0; return 0; } } return rdoq_block(h, p_aec, p_cu, p_coeff, bsx, bsy, i_level, qp, b_luma, intra_mode); } else { #if !ENABLE_WQUANT return g_funcs.dctf.quant(p_coeff, bsx * bsy, tab_Q_TAB[qp], shift, add); #else if (!h->WeightQuantEnable) { return g_funcs.dctf.quant(p_coeff, bsx * bsy, tab_Q_TAB[qp], shift, add); } else { int *levelscale = h->wq_data.levelScale[i_level - B4X4_IN_BIT][b_intra]; return g_funcs.dctf.wquant(p_coeff, bsx * bsy, tab_Q_TAB[qp], shift, add, levelscale); } #endif } } /* --------------------------------------------------------------------------- * inverse quantization */ static INLINE void tu_quant_inverse(xavs2_t *h, cu_t *p_cu, coeff_t *coef, int num_coeff, int i_level, int qp, int b_luma) { const int scale = tab_IQ_TAB[qp]; const int shift = tab_IQ_SHIFT[qp] + (h->param->sample_bit_depth + 1) + i_level - LIMIT_BIT; #if !ENABLE_WQUANT UNUSED_PARAMETER(h); UNUSED_PARAMETER(b_luma); UNUSED_PARAMETER(p_cu); g_funcs.dctf.dequant(coef, num_coeff, scale, shift); #else if (!h->WeightQuantEnable) { g_funcs.dctf.dequant(coef, num_coeff, scale, shift); } else { int b_hor = b_luma && p_cu->cu_info.i_tu_split == TU_SPLIT_HOR; int b_ver = b_luma && p_cu->cu_info.i_tu_split == 
TU_SPLIT_VER; const int16_t(*AVS_SCAN)[2] = NULL; int wqm_size_id = 0; int wqm_stride = 0; int wqm_shift = h->param->PicWQDataIndex == 1 ? 3 : 0; int xy_shift = 0; int16_t *wq_matrix; // adaptive frequency weighting quantization if ((h->param->enable_sdip || h->param->enable_nsqt) && (b_hor || b_ver)) { xy_shift = XAVS2_MIN(2, i_level - B4X4_IN_BIT); wqm_size_id = xy_shift + 1; if (b_hor) { AVS_SCAN = tab_coef_scan_list_hor[XAVS2_MIN(2, i_level - 2)]; } else { AVS_SCAN = tab_coef_scan_list_ver[XAVS2_MIN(2, i_level - 2)]; } } else { xy_shift = XAVS2_MIN(3, i_level - B4X4_IN_BIT); wqm_size_id = xy_shift + 1; AVS_SCAN = tab_coef_scan_list[XAVS2_MIN(3, i_level - 2)]; } wqm_stride = 1 << (wqm_size_id + B4X4_IN_BIT); if (wqm_size_id == 2) { wqm_stride >>= 1; } else if (wqm_size_id == 3) { wqm_stride >>= 2; } wq_matrix = h->wq_data.cur_wq_matrix[wqm_size_id]; dequant_weighted_c(coef, num_coeff, scale, shift, wqm_shift, wqm_stride, xy_shift, wq_matrix, AVS_SCAN); } #endif } /* --------------------------------------------------------------------------- */ static INLINE void tu_get_dct_coeff(xavs2_t *h, coeff_t *cur_blk, int pu_size_idx, int bsx, int bsy) { if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && !h->lcu.b_2nd_rdcost_pass && bsx >= 32 && bsy >= 32) { g_funcs.dctf.dct_half[pu_size_idx](cur_blk, cur_blk, bsx); } else { g_funcs.dctf.dct[pu_size_idx](cur_blk, cur_blk, bsx); } } /** * =========================================================================== * local function defines (chroma) * =========================================================================== */ /* --------------------------------------------------------------------------- * finish transform, quantization, inverse-transform, inverse-quantization * and reconstruction pixel generation of chroma block */ static int cu_recon_chroma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t *distortion) { int b_intra = IS_INTRA_MODE(p_cu->cu_info.i_mode); int pix_x_c = p_cu->i_pos_x >> 1; int pix_y_c = p_cu->i_pos_y >> CHROMA_V_SHIFT; int level_c = p_cu->cu_info.i_level - CHROMA_V_SHIFT; int bsize_c = 1 << level_c; int partidx_c = PART_INDEX(bsize_c, bsize_c); int cbp_c = 0; // Coding Block Pattern (CBP) of chroma blocks int num_nonzero; // number of non-zero coefficients int qp_c; int uv; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); coeff_t *cur_blk = p_enc->coeff_blk; pel_t *p_pred; /* prediction buffer of chroma blocks */ if (b_intra) { p_pred = p_enc->intra_pred_c[p_cu->cu_info.i_intra_mode_c]; } else { p_pred = p_enc->buf_pred_inter_c; } for (uv = 0; uv < 2; uv++) { pel_t *p_fdec = p_cu->cu_info.p_rec[uv + 1]; pel_t *p_fenc = h->lcu.p_fenc[uv + 1] + pix_y_c * FENC_STRIDE + pix_x_c; g_funcs.pixf.sub_ps[partidx_c](cur_blk, bsize_c, p_fenc, p_pred, FENC_STRIDE, FREC_CSTRIDE); // DCT, quantization, inverse quantization, IDCT, and reconstruction tu_get_dct_coeff(h, cur_blk, partidx_c, bsize_c, bsize_c); qp_c = cu_get_qp(h, &p_cu->cu_info); #if ENABLE_WQUANT qp_c += (uv == 0 ? 
h->param->chroma_quant_param_delta_u : h->param->chroma_quant_param_delta_v); #endif qp_c = cu_get_chroma_qp(h, qp_c, uv); num_nonzero = tu_quant_forward(h, p_aec, p_cu, cur_blk, level_c, bsize_c, bsize_c, qp_c, b_intra, 0, DC_PRED); cbp_c |= (num_nonzero != 0) << (4 + uv); if (num_nonzero) { g_funcs.pixf.copy_ss[partidx_c](p_cu->cu_info.p_coeff[uv + 1], bsize_c, cur_blk, bsize_c); tu_quant_inverse(h, p_cu, cur_blk, bsize_c * bsize_c, level_c, qp_c, 0); g_funcs.dctf.idct[partidx_c](cur_blk, cur_blk, bsize_c); g_funcs.pixf.add_ps[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, cur_blk, FREC_CSTRIDE, bsize_c); } else { g_funcs.pixf.copy_pp[partidx_c](p_fdec, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE); } *distortion += g_funcs.pixf.ssd[partidx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); p_pred += (FREC_CSTRIDE >> 1); // uvoffset } return cbp_c; } /* --------------------------------------------------------------------------- * get max available bits for left residual coding */ static ALWAYS_INLINE int rdo_get_left_bits(xavs2_t *h, rdcost_t min_rdcost, dist_t distortion) { rdcost_t f_lambda = h->f_lambda_mode; double f_left_bits = ((min_rdcost - distortion) * h->f_lambda_1th) + 1; int left_bits; left_bits = (int)XAVS2_CLIP3F(0.0f, 32766.0f, f_left_bits); // clipһ if (left_bits * f_lambda + distortion <= min_rdcost) { left_bits++; // ⸡֤ﵽֵʱrdcostmin_rdcost } return left_bits; } /** * =========================================================================== * local function defines (intra) * =========================================================================== */ /* --------------------------------------------------------------------------- * finish transform, quantization, inverse-transform, inverse-quantization * and reconstruction pixel generation of a intra luma block */ static INLINE int cu_recon_intra_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, pel_t *p_pred, int bsx, int bsy, int block_x, int block_y, int idx_tu, int intra_pred_mode, dist_t *distortion) { int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON); int pos_x = p_cu->i_pos_x + block_x; int pos_y = p_cu->i_pos_y + block_y; int part_idx = PART_INDEX(bsx, bsy); int w_tr = bsx >> used_wavelet; int h_tr = bsy >> used_wavelet; int num_non_zero; int b_2nd_trans = h->param->enable_secT; cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); pel_t *p_fenc = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x; pel_t *p_fdec = p_cu->cu_info.p_rec[0] + block_y * FREC_STRIDE + block_x; coeff_t *cur_blk = p_enc->coeff_blk; coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (idx_tu << ((p_cu->cu_info.i_level - 1) << 1)); int b_top = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_TOP); int b_left = IS_NEIGHBOR_AVAIL(p_cu->block_avail, MD_I_LEFT); // get prediction and prediction error g_funcs.pixf.sub_ps[PART_INDEX(bsx, bsy)](cur_blk, bsx, p_fenc, p_pred, FENC_STRIDE, bsx); // block transform if (part_idx == LUMA_4x4) { if (b_2nd_trans) { g_funcs.dctf.transform_4x4_2nd(cur_blk, w_tr); } else { g_funcs.dctf.dct[LUMA_4x4](cur_blk, cur_blk, 4); /* 4x4 dct */ } } else { tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); if (b_2nd_trans) { g_funcs.dctf.transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left); } } // quantization num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_tu_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 1, 1, intra_pred_mode); if (num_non_zero) { 
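        /* Reconstruction path when the block has non-zero levels (descriptive note):
         * the quantized levels are first copied into p_coeff_y for later entropy
         * coding, the working buffer is then dequantized, run through the inverse
         * (secondary) transform, and added back onto the prediction, i.e. roughly
         *     rec = pred + idct(dequant(coef)). */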
g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](p_coeff_y, w_tr, cur_blk, w_tr); // inverse quantization tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_tu_level, cu_get_qp(h, &p_cu->cu_info), 1); // inverse transform if (part_idx == LUMA_4x4) { if (b_2nd_trans) { g_funcs.dctf.inv_transform_4x4_2nd(cur_blk, w_tr); } else { g_funcs.dctf.idct[LUMA_4x4](cur_blk, cur_blk, 4); /* 4x4 idct */ } } else { if (b_2nd_trans) { g_funcs.dctf.inv_transform_2nd(cur_blk, w_tr, intra_pred_mode, b_top, b_left); } g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr); } g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, bsx, bsx); } else { g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, bsx); } // get distortion (SSD) of current block *distortion = g_funcs.pixf.ssd[part_idx](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); return num_non_zero; } /* --------------------------------------------------------------------------- * get the MPMs of an intra block at (pos_x, pos_y) as 4x4 address */ static ALWAYS_INLINE void xavs2_get_mpms(xavs2_t *h, cu_t *p_cu, int blockidx, int pos_y, int pos_x, int mpm[2]) { int ip_stride = h->i_width_in_minpu + 16; int top_mode = h->ipredmode[((pos_y >> MIN_PU_SIZE_IN_BIT) - 1) * ip_stride + pos_x]; int left_mode = h->ipredmode[(pos_y >> MIN_PU_SIZE_IN_BIT) * ip_stride + pos_x - 1]; if (blockidx != 0) { if (p_cu->cu_info.i_mode == PRED_I_2Nxn) { top_mode = p_cu->cu_info.real_intra_modes[blockidx - 1]; } else if (p_cu->cu_info.i_mode == PRED_I_nx2N) { left_mode = p_cu->cu_info.real_intra_modes[blockidx - 1]; } else if (p_cu->cu_info.i_mode == PRED_I_NxN) { switch (blockidx) { case 1: left_mode = p_cu->cu_info.real_intra_modes[0]; break; case 2: top_mode = p_cu->cu_info.real_intra_modes[0]; break; case 3: top_mode = p_cu->cu_info.real_intra_modes[1]; left_mode = p_cu->cu_info.real_intra_modes[2]; break; default: // case 0: break; } } } top_mode = (top_mode < 0) ? DC_PRED : top_mode; left_mode = (left_mode < 0) ? DC_PRED : left_mode; mpm[0] = XAVS2_MIN(top_mode, left_mode); mpm[1] = XAVS2_MAX(top_mode, left_mode); if (mpm[0] == mpm[1]) { mpm[0] = DC_PRED; mpm[1] = (mpm[1] == DC_PRED) ? BI_PRED : mpm[1]; } } /* --------------------------------------------------------------------------- * ֡PUַʽRDCostŵPUַʽ */ static void cu_check_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best, int mode, rdcost_t *min_rdcost) { int level = p_cu->cu_info.i_level; cu_layer_t *p_layer = cu_get_layer(h, level); cu_parallel_t *p_enc = cu_get_enc_context(h, level); rdcost_t rdcost_luma = 0; rdcost_t rdcost = MAX_COST; rdcost_t min_mode_rdcost = MAX_COST; pel_t *rec_bak_y = best->p_rec[0]; pel_t *p_best_part[4]; int blockidx; int num_luma_block = mode != PRED_I_2Nx2N ? 
4 : 1; int b_need_swap_buf = 0; int pix_x_c = p_cu->i_pos_x >> 1; int pix_y_c = p_cu->i_pos_y >> CHROMA_V_SHIFT; intra_candidate_t *p_candidates = p_layer->intra_candidates; /* ȷPU */ cu_init_pu_intra(h, &p_cu->cu_info, level, mode); /* ȷTU */ cu_set_tu_split_type(h, &p_cu->cu_info, mode != PRED_I_2Nx2N); h->copy_aec_state_rdo(&p_layer->cs_rdo, p_aec); p_cu->cu_info.i_cbp = 0; p_cu->intra_avail = (uint8_t)xavs2_intra_get_cu_neighbors(h, p_cu, p_cu->i_pix_x, p_cu->i_pix_y, p_cu->i_size); /* 1, intra luma prediction and mode decision */ for (blockidx = 0; blockidx < num_luma_block; blockidx++) { int mpm[2]; // most probable modes (MPMs) for current luma block int block_x = p_cu->cu_info.cb[blockidx].x; int block_y = p_cu->cu_info.cb[blockidx].y; int block_w = p_cu->cu_info.cb[blockidx].w; int block_h = p_cu->cu_info.cb[blockidx].h; int pos_x = p_cu->i_pos_x + block_x; int pos_y = p_cu->i_pos_y + block_y; int b4x4_x = (p_cu->i_pix_x + block_x) >> MIN_PU_SIZE_IN_BIT; dist_t best_dist = MAX_DISTORTION; int best_rate = INT_MAX; int best_mode = 0; int best_pmode = 0; int best_cbp = 0; pel_t *p_fenc = h->lcu.p_fenc[0] + pos_y * FENC_STRIDE + pos_x; rdcost_t best_rdcost = MAX_COST; int i; int num_for_rdo; intra_candidate_t *p_candidates = p_layer->intra_candidates; // candidate list, reserving the cost /* init */ xavs2_get_mpms(h, p_cu, blockidx, pos_y, b4x4_x, mpm); for (i = 0; i < INTRA_MODE_NUM_FOR_RDO; i++) { p_candidates[i].mode = 0; p_candidates[i].cost = MAX_COST; } /* conduct prediction and get intra prediction direction candidates for RDO */ num_for_rdo = h->lcu.get_intra_dir_for_rdo_luma(h, p_cu, p_candidates, p_fenc, mpm, blockidx, block_x, block_y, block_w, block_h); // store the coding state h->copy_aec_state_rdo(&p_enc->cs_pu_init, p_aec); /* RDO */ for (i = 0; i < num_for_rdo; i++) { rdcost_t rdcost; dist_t dist_curr; // ǰ֡ڿʧ int rate_curr = 0; // ǰ֡ڿʣ int mode = p_candidates[i].mode; pel_t *p_pred = p_enc->intra_pred[mode]; // get and check rate_chroma-distortion cost int mode_idx_aec = (mpm[0] == mode) ? -2 : ((mpm[1] == mode) ? -1 : (mpm[0] > mode ? mode : (mpm[1] > mode ? 
mode - 1 : mode - 2))); int num_nonzero; num_nonzero = cu_recon_intra_luma(h, p_aec, p_cu, p_pred, block_w, block_h, block_x, block_y, blockidx, mode, &dist_curr); num_nonzero = !!num_nonzero; { int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); int w_tr = block_w >> used_wavelet; int i_tu_level = p_cu->cu_info.i_level - (p_cu->cu_info.i_tu_split != TU_SPLIT_NON) - used_wavelet; int rate_luma_mode; coeff_t *p_coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1)); // get rate for intra prediction mode rate_luma_mode = p_aec->binary.write_intra_pred_mode(p_aec, mode_idx_aec); // get rate for luminance coefficients if (num_nonzero) { int bits_left = rdo_get_left_bits(h, best_rdcost, dist_curr) - rate_luma_mode; rate_curr = p_aec->binary.est_luma_block_coeff(h, p_aec, p_cu, p_coeff_y, &p_enc->runlevel, i_tu_level, xavs2_log2u(w_tr), 1, mode, bits_left); rate_luma_mode += rate_curr; } // calculate RD-cost and return it rdcost = dist_curr + h->f_lambda_mode * rate_luma_mode; } // choose best mode if (rdcost < best_rdcost) { XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); // set best mode update minimum cost best_dist = dist_curr; best_rate = rate_curr; best_rdcost = rdcost; best_mode = mode; best_pmode = mode_idx_aec; best_cbp = num_nonzero; // flag if dct-coefficients must be coded h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); } h->copy_aec_state_rdo(p_aec, &p_enc->cs_pu_init); if (IS_ALG_ENABLE(OPT_ET_RDO_INTRA_L)) { if (rdcost > best_rdcost * 1.2) { break; } } } // for (i = 0; i < num_for_rdo; i++) /* change the coding state to BEST */ if (best_rate < INT_MAX) { if (p_cu->cu_info.i_mode != PRED_I_2Nx2N) { g_funcs.pixf.copy_pp[PART_INDEX(block_w, block_h)](h->lcu.p_fdec[0] + pos_y * FDEC_STRIDE + pos_x, FDEC_STRIDE, p_layer->p_rec_tmp[0] + block_y * FREC_STRIDE + block_x, FREC_STRIDE); } /* copy coefficients and reconstructed data for best mode */ XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); p_best_part[blockidx] = p_cu->cu_info.p_rec[0]; /* set intra mode prediction */ p_cu->cu_info.pred_intra_modes[blockidx] = (int8_t)best_pmode; p_cu->cu_info.real_intra_modes[blockidx] = (int8_t)best_mode; /* copy coding state */ h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); } /* ģʽ״̬ʧ桢ȷųԤģʽCBP */ rdcost_luma += best_dist + h->f_lambda_mode * best_rate; p_cu->cu_info.i_cbp |= (best_cbp) << blockidx; /* ȿRDOǰֹ */ if (rdcost_luma >= *min_rdcost) { p_layer->mode_rdcost[mode] = MAX_COST; /* set the cost for SDIP fast algorithm */ h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); return; // ȿrdcostѾǰֵֹͣɫȿģʽ } } p_cu->feature.rdcost_luma = rdcost_luma; /* 2, store best luma reconstruction pixels */ for (blockidx = 0; blockidx < num_luma_block; blockidx++) { if (p_best_part[blockidx] != p_cu->cu_info.p_rec[0]) { int offset = p_cu->cu_info.cb[blockidx].y * FREC_STRIDE + p_cu->cu_info.cb[blockidx].x; int offset_coeff = blockidx << ((p_cu->cu_info.i_level - 1) << 1); int w_tr = p_cu->cu_info.cb[0].w; int h_tr = p_cu->cu_info.cb[0].h; int part_idx = PART_INDEX(w_tr, h_tr); g_funcs.pixf.copy_pp[part_idx](p_cu->cu_info.p_rec[0] + offset, FREC_STRIDE, p_layer->p_rec_tmp[0] + offset, p_cu->i_size); g_funcs.pixf.copy_ss[part_idx](p_cu->cu_info.p_coeff[0] + offset_coeff, w_tr, p_layer->p_coeff_tmp[0] + offset_coeff, w_tr); } } /* 3, Chroma mode decision and CU mode updating */ 
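    /* For each candidate chroma intra mode below, the U/V blocks are reconstructed
     * (cu_recon_chroma), the CU header and coefficient bits are estimated, and the
     * total cost
     *     rdcost = dist_chroma + f_lambda_mode * rate_chroma + rdcost_luma
     * is compared against *min_rdcost; only a candidate that lowers *min_rdcost
     * updates the stored best CU parameters and the saved AEC coding state. */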
if (h->param->chroma_format != CHROMA_400) { int lmode; int num_rdo_chroma_mode; int idx_chroma_mode; int tmp_cbp_luma = p_cu->cu_info.i_cbp; lmode = tab_intra_mode_luma2chroma[p_cu->cu_info.real_intra_modes[0]]; num_rdo_chroma_mode = h->get_intra_candidates_chroma(h, p_cu, level - 1, pix_y_c, pix_x_c, p_candidates); for (idx_chroma_mode = 0; idx_chroma_mode < num_rdo_chroma_mode; idx_chroma_mode++) { dist_t dist_chroma = 0; // ɫȿָ int rate_chroma = 0; int bits_left; int predmode_c = p_candidates[idx_chroma_mode].mode; int cbp_c; /* ɫȷڶεùеģʽѡֱѡģʽRDOQ */ if ((h->param->i_rdoq_level == RDOQ_CU_LEVEL && h->lcu.b_enable_rdoq) && predmode_c != best->i_intra_mode_c) { continue; } if (predmode_c != DM_PRED_C && predmode_c == lmode) { continue; } p_cu->cu_info.i_intra_mode_c = (int8_t)predmode_c; /* RDO̵ɫȿع̣任任عֵ */ cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma); p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp_luma + cbp_c); /* ------- GET RATE -------- */ rate_chroma = p_aec->binary.est_cu_header(h, p_aec, p_cu); #if ENABLE_RATE_CONTROL_CU rate_chroma += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant); #else rate_chroma += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h); #endif bits_left = rdo_get_left_bits(h, *min_rdcost - rdcost_luma, dist_chroma); if (p_cu->cu_info.i_cbp & (1 << 4)) { int cur_bits_left = bits_left - rate_chroma; rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[1], &p_enc->runlevel, level - 1, cur_bits_left); } if (p_cu->cu_info.i_cbp & (1 << 5)) { int cur_bits_left = bits_left - rate_chroma; rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[2], &p_enc->runlevel, level - 1, cur_bits_left); } rdcost = dist_chroma + h->f_lambda_mode * rate_chroma + rdcost_luma; min_mode_rdcost = XAVS2_MIN(rdcost, min_mode_rdcost); if (rdcost < *min_rdcost) { *min_rdcost = rdcost; h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); /* store coding state for the best mode */ cu_store_parameters(h, p_cu, best); b_need_swap_buf = 1; } h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* revert to AEC context of best Luma mode */ if (IS_ALG_ENABLE(OPT_FAST_RDO_INTRA_C)) { if (rdcost > *min_rdcost * 2 || cbp_c == 0) { break; } } } } else { /* YUV400 */ /* ------- GET RATE -------- */ int rate_hdr = p_aec->binary.est_cu_header(h, p_aec, p_cu); #if ENABLE_RATE_CONTROL_CU rate_hdr += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant); #else rate_hdr += p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h); #endif rdcost = h->f_lambda_mode * rate_hdr + rdcost_luma; if (rdcost < *min_rdcost) { *min_rdcost = rdcost; h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); /* store coding state for the best mode */ cu_store_parameters(h, p_cu, best); b_need_swap_buf = 1; } } h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); /* revert to initial AEC context */ /* 4, confirm the buffer pointers and record the best information */ if (best->p_rec[0] == rec_bak_y && b_need_swap_buf) { XAVS2_SWAP_PTR(best->p_rec[0], p_cu->cu_info.p_rec[0]); XAVS2_SWAP_PTR(best->p_coeff[0], p_cu->cu_info.p_coeff[0]); } p_layer->mode_rdcost[mode] = min_mode_rdcost; /* store the cost for SDIP fast algorithm */ } //#if OPT_BYPASS_SDIP /* --------------------------------------------------------------------------- * SDIP fast */ static ALWAYS_INLINE int sdip_early_bypass(xavs2_t *h, cu_layer_t *p_layer, int i_mode) { UNUSED_PARAMETER(h); return i_mode == 
PRED_I_nx2N && (p_layer->mode_rdcost[PRED_I_2Nxn] < p_layer->mode_rdcost[PRED_I_2Nx2N] * 0.9); } //#endif /** * =========================================================================== * local function defines (inter) * =========================================================================== */ //#if OPT_FAST_ZBLOCK || OPT_ECU static const int tab_th_zero_block_sad[][5] = { { 7, 19, 72, 281, 1115 }, { 7, 19, 73, 281, 1116 }, { 7, 20, 73, 282, 1118 }, { 8, 20, 74, 283, 1120 }, { 8, 20, 74, 284, 1122 }, { 8, 20, 75, 285, 1124 }, { 8, 21, 75, 286, 1126 }, { 8, 21, 76, 288, 1129 }, { 9, 21, 77, 289, 1132 }, { 9, 22, 77, 291, 1135 }, { 9, 22, 78, 292, 1138 }, { 10, 23, 79, 294, 1142 }, { 10, 23, 80, 296, 1146 }, { 10, 24, 81, 298, 1150 }, { 11, 24, 82, 301, 1155 }, { 11, 25, 84, 303, 1160 }, { 12, 26, 85, 306, 1166 }, { 12, 26, 87, 309, 1172 }, { 13, 27, 88, 312, 1179 }, { 13, 28, 90, 316, 1186 }, { 14, 29, 92, 320, 1194 }, { 15, 30, 94, 325, 1203 }, { 15, 31, 97, 329, 1213 }, { 16, 33, 99, 334, 1223 }, { 17, 34, 102, 340, 1235 }, { 18, 36, 105, 346, 1247 }, { 20, 37, 109, 353, 1260 }, { 21, 39, 112, 360, 1275 }, { 22, 41, 116, 368, 1292 }, { 24, 43, 121, 377, 1309 }, { 25, 46, 125, 386, 1328 }, { 27, 48, 131, 397, 1349 }, { 29, 51, 136, 408, 1372 }, { 31, 54, 142, 420, 1397 }, { 33, 58, 149, 434, 1424 }, { 36, 61, 156, 448, 1453 }, { 38, 65, 164, 464, 1485 }, { 41, 70, 173, 482, 1520 }, { 45, 74, 183, 501, 1559 }, { 48, 79, 193, 521, 1600 }, { 52, 85, 204, 544, 1646 }, { 56, 91, 217, 569, 1696 }, { 61, 98, 230, 596, 1750 }, { 66, 105, 245, 625, 1809 }, { 71, 113, 261, 657, 1873 }, { 77, 122, 278, 692, 1944 }, { 83, 132, 297, 729, 2020 }, { 90, 142, 318, 771, 2104 }, { 98, 153, 341, 816, 2195 }, { 106, 166, 365, 865, 2294 }, { 116, 179, 392, 919, 2403 }, { 126, 194, 422, 978, 2521 }, { 136, 210, 454, 1042, 2649 }, { 148, 227, 488, 1111, 2790 }, { 161, 246, 526, 1187, 2943 }, { 175, 267, 568, 1270, 3110 }, { 191, 290, 613, 1360, 3292 }, { 207, 314, 662, 1459, 3491 }, { 225, 341, 716, 1566, 3707 }, { 245, 370, 775, 1683, 3944 }, { 267, 402, 839, 1811, 4201 }, { 291, 437, 909, 1950, 4482 }, { 316, 475, 985, 2102, 4788 }, { 345, 517, 1068, 2268, 5123 }, { 375, 562, 1158, 2448, 5487 }, { 412, 617, 1268, 2667, 5928 }, { 445, 665, 1364, 2860, 6317 }, { 485, 724, 1482, 3094, 6790 }, { 528, 788, 1610, 3350, 7305 }, { 576, 858, 1749, 3628, 7867 }, { 631, 939, 1912, 3954, 8524 }, { 687, 1022, 2078, 4285, 9192 }, { 748, 1113, 2259, 4647, 9920 }, { 812, 1206, 2446, 5019, 10671 }, { 884, 1313, 2661, 5448, 11537 }, { 964, 1431, 2895, 5917, 12482 }, { 1047, 1553, 3140, 6406, 13469 }, { 1145, 1698, 3430, 6985, 14636 }, { 1248, 1850, 3735, 7592, 15862 }, { 1357, 2011, 4055, 8233, 17154 } }; /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE bool_t isZeroCuFast(xavs2_t *h, cu_t *p_cu) { int i_level = p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT; int i_qp = cu_get_qp(h, &p_cu->cu_info); int thres_satd = (int)(tab_th_zero_block_sad[i_qp][i_level] * h->param->factor_zero_block); return p_cu->sum_satd < thres_satd; } //#endif /* --------------------------------------------------------------------------- * int scrFlag = 0; // 0=noSCR, 1=strongSCR, 2=jmSCR */ static INLINE int tu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int8_t *cbp, int blockidx, coeff_t *cur_blk, int x_pu, int y_pu, int w_pu, int h_pu) { cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); int used_wavelet = (p_cu->cu_info.i_level == B64X64_IN_BIT && 
p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); int part_idx = PART_INDEX(w_pu, h_pu); int w_tr = w_pu >> used_wavelet; int h_tr = h_pu >> used_wavelet; int num_non_zero = 0; pel_t *p_fdec = p_cu->cu_info.p_rec[0] + y_pu * FREC_STRIDE + x_pu; pel_t *p_pred = p_layer->buf_pred_inter + y_pu * FREC_STRIDE + x_pu; coeff_t *coeff_y = p_cu->cu_info.p_coeff[0] + (blockidx << ((p_cu->cu_info.i_level - 1) << 1)); tu_get_dct_coeff(h, cur_blk, part_idx, w_tr, h_tr); num_non_zero = tu_quant_forward(h, p_aec, p_cu, cur_blk, i_level, w_tr, h_tr, cu_get_qp(h, &p_cu->cu_info), 0, 1, DC_PRED); if (num_non_zero != 0) { *cbp |= (1 << blockidx); // ָλΪ 1 g_funcs.pixf.copy_ss[PART_INDEX(w_tr, h_tr)](coeff_y, w_tr, cur_blk, w_tr); tu_quant_inverse(h, p_cu, cur_blk, w_tr * h_tr, i_level, cu_get_qp(h, &p_cu->cu_info), 1); g_funcs.dctf.idct[part_idx](cur_blk, cur_blk, w_tr); g_funcs.pixf.add_ps[part_idx](p_fdec, FREC_STRIDE, p_pred, cur_blk, FREC_STRIDE, w_pu); } else { /* CBPָλֵCBPʼֵΪ0 */ // ȫ鲻任ֻ追ԤֵΪعֵ coeff_y[0] = 0; if (p_cu->cu_info.i_tu_split) { g_funcs.pixf.copy_pp[part_idx](p_fdec, FREC_STRIDE, p_pred, FREC_STRIDE); } } return num_non_zero; } /* --------------------------------------------------------------------------- * ָʽع֡ԤⷽʽCUȷ * صǰCUʧ棨ɫȿʧ棩 */ static dist_t cu_recon_inter_luma(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int is_non_residual, int b_tu_split, int cbp_c, dist_t dist_chroma) { cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); coeff_t *cur_blk = p_enc->coeff_blk; coeff_t *coeff_bak = p_enc->coeff_bak; coeff_t *p_resi; int level = p_cu->cu_info.i_level; int num_nonzero = 0; int sum_dc_coeff = 0; int b_zero_block = 0; int blockidx; int pix_x = p_cu->i_pos_x; int pix_y = p_cu->i_pos_y; int cu_size = p_cu->i_size; int cu_size_2 = cu_size >> 1; int cu_size_4 = cu_size_2 >> 1; dist_t distortion; pel_t *p_fenc; pel_t *p_fdec; /* clear CBP */ p_cu->cu_info.i_cbp = 0; /* encode for luma */ cu_set_tu_split_type(h, &p_cu->cu_info, b_tu_split); if (is_non_residual) { /* SKIP mode (or no residual coding) */ int uvoffset = (FREC_CSTRIDE >> 1); int part_idx_c = PART_INDEX(cu_size_2, cu_size_2); int pix_x_c = pix_x >> 1; int pix_y_c = pix_y >> CHROMA_V_SHIFT; h->lcu.bypass_all_dmh |= (p_cu->cu_info.dmh_mode == 0); /* copy Y component and get distortion */ p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; p_fdec = p_cu->cu_info.p_rec[0]; g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_fdec, FREC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); distortion = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); /* chroma distortion */ if (cbp_c) { /* copy U component and get distortion */ p_fenc = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c; p_fdec = p_cu->cu_info.p_rec[1]; g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c, FREC_CSTRIDE); distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); /* copy V component and get distortion */ p_fenc = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c; p_fdec = p_cu->cu_info.p_rec[2]; g_funcs.pixf.copy_pp[part_idx_c](p_fdec, FREC_CSTRIDE / 2, p_enc->buf_pred_inter_c + uvoffset, FREC_CSTRIDE); distortion += g_funcs.pixf.ssd[part_idx_c](p_fenc, FENC_STRIDE, p_fdec, FREC_CSTRIDE / 2); } else { distortion += dist_chroma; } return distortion; } else if (p_cu->cu_info.i_tu_split) { int pix_cu_x = 0; int pix_cu_y = 0; switch (p_cu->cu_info.i_tu_split) { case TU_SPLIT_HOR: 
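        /* Horizontal TU split: the CU residual (cu_size x cu_size) is handled as four
         * cu_size x (cu_size/4) stripes, each transformed and quantized one level below
         * the CU level; the DC level of every stripe is accumulated into sum_dc_coeff so
         * that the CU can later be treated as a (near-)zero block when both the count of
         * non-zero levels and the DC sum stay under their thresholds. */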
g_funcs.pixf.copy_ss[PART_INDEX(cu_size, cu_size)](cur_blk, cu_size, coeff_bak, cu_size); for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_y += cu_size_4) { p_resi = cur_blk + pix_cu_y * cu_size + pix_cu_x; num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, p_resi, pix_cu_x, pix_cu_y, cu_size, cu_size_4); sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); } break; case TU_SPLIT_VER: for (blockidx = 0; blockidx < 4; blockidx++, pix_cu_x += cu_size_4) { p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; g_funcs.pixf.copy_ss[PART_INDEX(cu_size_4, cu_size)](cur_blk, cu_size_4, p_resi, cu_size); num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_4, cu_size); sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); } break; default: for (blockidx = 0; blockidx < 4; blockidx++) { pix_cu_x = (blockidx & 1) * cu_size_2; pix_cu_y = (blockidx >> 1) * cu_size_2; p_resi = coeff_bak + pix_cu_y * cu_size + pix_cu_x; g_funcs.pixf.copy_ss[PART_INDEX(cu_size_2, cu_size_2)](cur_blk, cu_size_2, p_resi, cu_size); num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level - 1, &p_cu->cu_info.i_cbp, blockidx, cur_blk, pix_cu_x, pix_cu_y, cu_size_2, cu_size_2); sum_dc_coeff += XAVS2_ABS(p_cu->cu_info.p_coeff[0][pix_cu_y * cu_size + pix_cu_x]); } break; } // ǰCUϵ LUMA_COEFF_COST DCϵ£϶Ϊȫ b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); } else { if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block) { b_zero_block = 1; } else { num_nonzero += tu_recon_inter_luma(h, p_aec, p_cu, level, &p_cu->cu_info.i_cbp, 0, coeff_bak, 0, 0, cu_size, cu_size); // ǰCUб任ķϵ LUMA_COEFF_COST DCϵ£϶Ϊȫ sum_dc_coeff = XAVS2_ABS(p_cu->cu_info.p_coeff[0][0]); b_zero_block = (num_nonzero <= LUMA_COEFF_COST && sum_dc_coeff <= MAX_COEFF_QUASI_ZERO); } } if (b_zero_block) { h->lcu.bypass_all_dmh |= (h->i_type == SLICE_TYPE_F && p_cu->cu_info.dmh_mode == 0); p_cu->cu_info.i_cbp = 0; g_funcs.pixf.copy_pp[PART_INDEX(cu_size, cu_size)](p_cu->cu_info.p_rec[0], FREC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); } /* set CBP */ p_cu->cu_info.i_cbp += (int8_t)cbp_c; /* luma distortion */ p_fenc = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; p_fdec = p_cu->cu_info.p_rec[0]; distortion = dist_chroma; distortion += g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)](p_fenc, FENC_STRIDE, p_fdec, FREC_STRIDE); return distortion; } /* --------------------------------------------------------------------------- * R-D cost for a inter cu whether split or not * Return: rate-distortion cost of cu when TU is split or not */ static int tu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, dist_t distortion, int rate_chroma, rdcost_t *rdcost) { int mode = p_cu->cu_info.i_mode; int level = p_cu->cu_info.i_level; int rate; int block_idx; cu_parallel_t *p_enc = cu_get_enc_context(h, level); /* ------------------------------------------------------------- * get rate */ /* rate of cu header */ rate = p_aec->binary.est_cu_header(h, p_aec, p_cu); /* rate of motion information */ rate += p_aec->binary.est_cu_refs_mvds(h, p_aec, p_cu); /* tu information */ if (mode != PRED_SKIP || p_cu->cu_info.i_cbp) { int bits_left = rdo_get_left_bits(h, *rdcost, distortion); int cur_bits_left; /* rate of cbp & dqp */ #if ENABLE_RATE_CONTROL_CU rate += p_aec->binary.write_cu_cbp_dqp(h, p_aec, &p_cu->cu_info, h->i_slice_index, h->last_dquant); #else rate += 
p_aec->binary.write_cu_cbp(p_aec, &p_cu->cu_info, h->i_slice_index, h); #endif /* rate of luma coefficients */ if (p_cu->cu_info.i_tu_split != TU_SPLIT_NON) { int use_wavelet = (level == B64X64_IN_BIT && p_cu->cu_info.i_tu_split != TU_SPLIT_CROSS); int i_tu_level = level - 1 - use_wavelet; for (block_idx = 0; block_idx < 4; block_idx++) { if (p_cu->cu_info.i_cbp & (1 << block_idx)) { cb_t tb; cur_bits_left = bits_left - rate; cu_init_transform_block(p_cu->cu_info.i_level, p_cu->cu_info.i_tu_split, block_idx, &tb); rate += p_aec->binary.est_luma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[0] + (block_idx << ((p_cu->cu_info.i_level - 1) << 1)), &p_enc->runlevel, i_tu_level, xavs2_log2u(tb.w) - use_wavelet, 0, 0, cur_bits_left); } } } else { if (p_cu->cu_info.i_cbp & 15) { int i_tu_level = level - (level == B64X64_IN_BIT); cur_bits_left = bits_left - rate; rate += p_aec->binary.est_luma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[0], &p_enc->runlevel, i_tu_level, i_tu_level, 0, 0, cur_bits_left); } } /* rate of chroma coefficients */ if (IS_ALG_ENABLE(OPT_ADVANCE_CHROMA_AEC)) { if (p_cu->cu_info.i_cbp != 0) { // not skip mode rate += rate_chroma; } } else { level--; if (p_cu->cu_info.i_cbp & (1 << 4)) { cur_bits_left = bits_left - rate; rate += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[1], &p_enc->runlevel, level, cur_bits_left); } if (p_cu->cu_info.i_cbp & (1 << 5)) { cur_bits_left = bits_left - rate; rate += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[2], &p_enc->runlevel, level, cur_bits_left); } } } /* ------------------------------------------------------------- * get rate-distortion cost */ *rdcost = distortion + h->f_lambda_mode * rate; return p_cu->cu_info.i_cbp; } /* --------------------------------------------------------------------------- * ȡȡɫȷԤֵMVǷЧΧ */ static ALWAYS_INLINE int rdo_get_pred_inter(xavs2_t *h, cu_t *p_cu, int cal_luma_chroma) { cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); int blockidx; /* get prediction data */ for (blockidx = 0; blockidx < p_cu->cu_info.num_pu; blockidx++) { cb_t cur_cb = p_cu->cu_info.cb[blockidx]; int start_x = cur_cb.x; int start_y = cur_cb.y; int width = cur_cb.w; int height = cur_cb.h; int pix_x = p_cu->i_pix_x + start_x; int pix_y = p_cu->i_pix_y + start_y; mv_t mv_1st, mv_2nd; // һǰB֡Ԥ⣩͵ڶ˶ʸ int ref_1st, ref_2nd; // һǰB֡Ԥ⣩͵ڶ򣩲ο֡ int num_mvs; int b_mv_valid; // MVǷЧСȡֵǷڱ׼涨ЧΧ pel_t *p_temp = p_enc->buf_pixel_temp; pel_t *p_pred; xavs2_frame_t *p_ref1 = NULL; xavs2_frame_t *p_ref2 = NULL; /* MV1Ϊ˫ο֡/DMHԤ */ num_mvs = cu_get_mvs_for_mc(h, p_cu, blockidx, &mv_1st, &mv_2nd, &ref_1st, &ref_2nd); b_mv_valid = check_mv_range(h, &mv_1st, ref_1st, pix_x, pix_y, width, height); if (num_mvs > 1) { b_mv_valid &= check_mv_range(h, &mv_2nd, ref_2nd, pix_x, pix_y, width, height); get_mv_for_mc(h, &mv_2nd, pix_x, pix_y, width, height); p_ref2 = h->fref[ref_2nd]; } get_mv_for_mc(h, &mv_1st, pix_x, pix_y, width, height); p_ref1 = h->fref[ref_1st]; if (!b_mv_valid && p_cu->cu_info.i_mode != PRED_SKIP) { return 0; } /* y component */ if (cal_luma_chroma & 1) { p_pred = p_layer->buf_pred_inter + start_y * FREC_STRIDE + start_x; mc_luma(p_pred, FREC_STRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1); if (num_mvs > 1) { mc_luma(p_temp, width, mv_2nd.x, mv_2nd.y, width, height, p_ref2); g_funcs.pixf.avg[PART_INDEX(width, height)](p_pred, FREC_STRIDE, p_pred, FREC_STRIDE, p_temp, width, 32); } } /* u and v 
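       (for CHROMA_420 every offset, width and height below is the corresponding luma
       value shifted right by one; mc_chroma interpolates U and V together, and when a
       second motion vector is present the two predictions are averaged, falling back to
       pixf.average for the 2- and 6-sample block sizes)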
component */ if (h->param->chroma_format == CHROMA_420 && (cal_luma_chroma & 2)) { int uvoffset = (FREC_CSTRIDE >> 1); start_x >>= 1; width >>= 1; pix_x >>= 1; start_y >>= CHROMA_V_SHIFT; pix_y >>= CHROMA_V_SHIFT; height >>= CHROMA_V_SHIFT; p_pred = p_enc->buf_pred_inter_c + start_y * FREC_CSTRIDE + start_x; /* u component */ mc_chroma(p_pred, p_pred + uvoffset, FREC_CSTRIDE, mv_1st.x, mv_1st.y, width, height, p_ref1); if (num_mvs > 1) { mc_chroma(p_temp, p_temp + uvoffset, FREC_CSTRIDE, mv_2nd.x, mv_2nd.y, width, height, p_ref2); if (width != 2 && width != 6 && height != 2 && height != 6) { pixel_avg_pp_t func_avg = g_funcs.pixf.avg[PART_INDEX(width, height)]; func_avg(p_pred , FREC_CSTRIDE, p_pred , FREC_CSTRIDE, p_temp , FREC_CSTRIDE, 32); func_avg(p_pred + uvoffset, FREC_CSTRIDE, p_pred + uvoffset, FREC_CSTRIDE, p_temp + uvoffset, FREC_CSTRIDE, 32); } else { g_funcs.pixf.average(p_pred, FREC_CSTRIDE / 2, p_pred, FREC_CSTRIDE / 2, p_temp, FREC_CSTRIDE / 2, width, height * 2); } } } } return 1; } /* --------------------------------------------------------------------------- * compute rd-cost for inter cu * return 1, means it is the best mode * 0, means it is not the best mode */ static int cu_rdcost_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, rdcost_t *min_rdcost, cu_info_t *p_best) { static int8_t tab_try_2level_tu[4][2] = { /* try non-split; try split */ {1, 0}, /* 8x8 */ {1, 0}, /* 16x16 */ {1, 0}, /* 32x32 */ {1, 0}, /* 64x64 */ }; bool_t b_try_tu_nonsplit = h->param->b_fast_2lelvel_tu ? tab_try_2level_tu[p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT][0] : 1; bool_t b_try_tu_split = h->param->b_fast_2lelvel_tu ? tab_try_2level_tu[p_cu->cu_info.i_level - MIN_CU_SIZE_IN_BIT][1] : 1; int mode = p_cu->cu_info.i_mode; int cu_size = p_cu->i_size; int tmp_cbp; /* cbp for i_tu_split = 1*/ int cbp_c = 0; int rate_chroma = 0; dist_t dist_chroma = 0; dist_t dist_split = 0; dist_t dist_notsplit = 0; dist_t best_dist_cur = 0; rdcost_t rdcost = *min_rdcost; // ʼΪRDCost rdcost_t rdcost_split = rdcost; pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); /* ------------------------------------------------------------- * 1, begin */ p_cu->cu_info.i_cbp = 0; p_cu->cu_info.i_tu_split = TU_SPLIT_NON; // cu_set_tu_split_type(h, &p_cu->cu_info, 0); /* set reference frame and block mode */ cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, mode); /* store coding state */ h->copy_aec_state_rdo(&p_layer->cs_rdo, p_aec); /* ------------------------------------------------------------- * 2. 
get prediction data */ if (!rdo_get_pred_inter(h, p_cu, 3)) { return 0; } /* ------------------------------------------------------------- * 3, tu decision */ /* 3.1, check chroma residual coding */ if (h->param->chroma_format == CHROMA_420){ cbp_c = cu_recon_chroma(h, p_aec, p_cu, &dist_chroma); if (IS_ALG_ENABLE(OPT_ADVANCE_CHROMA_AEC)) { int bits_left = rdo_get_left_bits(h, *min_rdcost, dist_chroma); int i_level_c = p_cu->cu_info.i_level - 1; int cur_bits_left; if (cbp_c & (1 << 4)) { cur_bits_left = bits_left - rate_chroma; rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[1], &p_enc->runlevel, i_level_c, cur_bits_left); } if (cbp_c & (1 << 5)) { cur_bits_left = bits_left - rate_chroma; rate_chroma += p_aec->binary.est_chroma_block_coeff(h, p_aec, p_cu, p_cu->cu_info.p_coeff[2], &p_enc->runlevel, i_level_c, cur_bits_left); } } } /* 3.2, check luma CU tu-split type and CBP */ /* 3.2.1, get luma residual */ g_funcs.pixf.sub_ps[PART_INDEX(cu_size, cu_size)](p_enc->coeff_bak, cu_size, p_fenc, p_layer->buf_pred_inter, FENC_STRIDE, FREC_STRIDE); /* 3.2.2, Fast algorithm, check whether TU split is essential */ if (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) || IS_ALG_ENABLE(OPT_ECU)) { p_cu->sum_satd = g_funcs.pixf.sad[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc, FENC_STRIDE); p_cu->is_zero_block = isZeroCuFast(h, p_cu); } /* only get cost with tu depth equals 1 */ if ((h->enable_tu_2level == 1) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split != 0))) { if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) { b_try_tu_split = FALSE; } if (b_try_tu_split) { h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */ dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 0, 1, cbp_c, dist_chroma); tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); /* store dct coefficients, rec data and coding state for tu depth = 1*/ XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); } else { rdcost_split = MAX_COST; tmp_cbp = 0; } if (rdcost_split >= *min_rdcost) { h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); return 0; /* return code = 0, means it is not the best mode */ } else { XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost_split, p_layer->mode_rdcost[mode]); /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ p_cu->cu_info.i_cbp = (int8_t)tmp_cbp; *min_rdcost = rdcost_split; p_cu->best_dist_total = dist_split; h->copy_aec_state_rdo(&p_layer->cs_cu, &p_enc->cs_tu); h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); cu_store_parameters(h, p_cu, p_best); return 1; /* return code = 1, means it is the best mode */ } } else if ((h->enable_tu_2level == 0) || ((h->enable_tu_2level == 3) && (p_best->i_tu_split == 0))) { /* only get cost with tu depth equals 0 */ dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma); tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, &rdcost); } else { if (b_try_tu_split && b_try_tu_nonsplit && (IS_ALG_ENABLE(OPT_FAST_ZBLOCK) && p_cu->is_zero_block)) { b_try_tu_split = FALSE; } if (b_try_tu_split) { h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for tu depth = 1 */ dist_split = cu_recon_inter_luma(h, &p_enc->cs_tu, p_cu, 
0, 1, cbp_c, dist_chroma); tmp_cbp = tu_rdcost_inter(h, &p_enc->cs_tu, p_cu, dist_split, rate_chroma, &rdcost_split); /* store dct coefficients, rec data and coding state for tu depth = 1*/ XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); } else { rdcost_split = MAX_COST; tmp_cbp = 0; } /* 3.2.4, get cost with tu depth equals 0 */ if (b_try_tu_nonsplit) { dist_notsplit = cu_recon_inter_luma(h, p_aec, p_cu, 0, 0, cbp_c, dist_chroma); tu_rdcost_inter(h, p_aec, p_cu, dist_notsplit, rate_chroma, &rdcost); } /* 3.2.5, choose the best tu depth (whether split or not) */ if (rdcost > rdcost_split) { /* the best tu depth is 1 */ rdcost = rdcost_split; best_dist_cur = dist_split; cu_set_tu_split_type(h, &p_cu->cu_info, 1); /* restore the cbp, dct coefficients, rec data and coding state for tu depth = 1*/ p_cu->cu_info.i_cbp = (int8_t)tmp_cbp; XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); XAVS2_SWAP_PTR(p_cu->cu_info.p_coeff[0], p_layer->p_coeff_tmp[0]); h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ } else { best_dist_cur = dist_notsplit; } } if (IS_ALG_ENABLE(OPT_CBP_DIRECT) && IS_SKIP_MODE(mode)) { /* Skip/DirectģʽIJв任Ϊȫ飺 * ʱֹ²CUֿԵõ϶ʱʡʧС * ͨPUģʽܴļ١ */ p_cu->b_cbp_direct = (p_cu->cu_info.i_cbp == 0); } /* 3.3, check skip mode for PRED_SKIP when CBP is nonzero */ if (IS_SKIP_MODE(p_cu->cu_info.i_mode) && p_cu->cu_info.i_cbp != 0) { rdcost_t rdcost_skip = MAX_COST; dist_t dist_total_skip; int best_tu_split_type = p_cu->cu_info.i_tu_split; if (best_tu_split_type == TU_SPLIT_NON) { h->copy_aec_state_rdo(&p_enc->cs_tu, p_aec); /* store coding state for best Direct mode */ } h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo);/* restore coding state */ tmp_cbp = p_cu->cu_info.i_cbp; /* backup reconstruction buffers, prepare for SKIP mode */ XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); if (cbp_c != 0) { XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]); XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]); } /* check SKIP Mode */ dist_total_skip = cu_recon_inter_luma(h, p_aec, p_cu, 1, 0, cbp_c, dist_chroma); tu_rdcost_inter(h, p_aec, p_cu, dist_total_skip, rate_chroma, &rdcost_skip); if (rdcost_skip <= rdcost) { rdcost = rdcost_skip; /* skip mode is the best */ best_dist_cur = dist_total_skip; p_cu->cu_info.i_tu_split = TU_SPLIT_NON; } else { h->copy_aec_state_rdo(p_aec, &p_enc->cs_tu); /* restore coding state */ /* revert buffers */ XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[0], p_layer->p_rec_tmp[0]); if (cbp_c != 0) { XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[1], p_layer->p_rec_tmp[1]); XAVS2_SWAP_PTR(p_cu->cu_info.p_rec[2], p_layer->p_rec_tmp[2]); } p_cu->cu_info.i_cbp = (int8_t)(tmp_cbp); p_cu->cu_info.i_tu_split = (int8_t)(best_tu_split_type); } } /* ------------------------------------------------------------- * 4, store the min cost for current cu mode */ p_layer->mode_rdcost[mode] = XAVS2_MIN(rdcost, p_layer->mode_rdcost[mode]); /* ------------------------------------------------------------- * 5, update the min cost, restore the coding state and return */ if (rdcost >= *min_rdcost) { h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); return 0; /* return code = 0, means it is not the best mode */ } else { if (mode == PRED_SKIP && IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) { /* re-cover best skip prediction data */ XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); } *min_rdcost = rdcost; p_cu->best_dist_total = best_dist_cur; /* store coding 
state for the best mode */ h->copy_aec_state_rdo(&p_layer->cs_cu, p_aec); h->copy_aec_state_rdo(p_aec, &p_layer->cs_rdo); /* update best CU information */ cu_store_parameters(h, p_cu, p_best); return 1; /* return code = 1, means it is the best mode */ } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void cu_set_mvs_noskip(cu_t *p_cu, int blockidx, int ref1, mv_t *pmv1, int ref2, mv_t *pmv2) { const int mode = p_cu->cu_info.i_mode; if (mode == PRED_2Nx2N) { int k; for (k = 0; k < 4; k++) { p_cu->mc.mv[k][0].v = pmv1->v; p_cu->mc.mv[k][1].v = pmv2->v; p_cu->cu_info.ref_idx_1st[k] = (int8_t)ref1; p_cu->cu_info.ref_idx_2nd[k] = (int8_t)ref2; } } else { p_cu->mc.mv [blockidx][0].v = pmv1->v; p_cu->cu_info.ref_idx_1st[blockidx] = (int8_t)ref1; p_cu->mc.mv [blockidx][1].v = pmv2->v; p_cu->cu_info.ref_idx_2nd[blockidx] = (int8_t)ref2; } } /* --------------------------------------------------------------------------- */ static rdcost_t cu_rdo_motion_estimation(xavs2_t *h, cu_t *p_cu, xavs2_me_t *p_me, int dualpred_enabled) { const int mode = p_cu->cu_info.i_mode; const int block_num = mode == PRED_2Nx2N ? 1 : 2; int best_fwd_ref = 0; int best_pdir = PDIR_FWD; int dual_best_fst_ref = 0; int dual_best_snd_ref = 0; int block, b8; dist_t fwd_cost = MAX_DISTORTION; dist_t bwd_cost = 0; rdcost_t total_cost = 0; int ref1, ref2; // best references mv_t mv1, mv2; // best mvs cb_t *p_cb; cu_mode_t *p_mode = cu_get_layer_mode(h, p_cu->cu_info.i_level); cu_mv_mode_t *p_mv_mode; p_cu->cu_info.b8pdir[0] = p_cu->cu_info.b8pdir[1] = p_cu->cu_info.b8pdir[2] = p_cu->cu_info.b8pdir[3] = (int8_t)PDIR_FWD; p_me->bmvcost[0] = p_me->bmvcost[1] = p_me->bmvcost[2] = p_me->bmvcost[3] = MAX_DISTORTION; p_me->b_search_dmh = (dualpred_enabled == -1); // motion estimation for 2Nx2N, 2NxN, Nx2N, AMP blocks for (block = 0; block < block_num; block++) { mv1.v = mv2.v = 0; p_cb = &p_cu->cu_info.cb[block]; cu_get_neighbors(h, p_cu, p_cb); /* һPUҪ½MEMVP䣩 */ if (dualpred_enabled < 0 && block == 0) { best_fwd_ref = p_mode->ref_idx_single[0]; } else { best_fwd_ref = pred_inter_search_single(h, p_cu, p_cb, p_me, &fwd_cost, &bwd_cost); } b8 = pu_get_mv_index(mode, block); p_mv_mode = &p_mode->mvs[mode][b8]; if (dualpred_enabled < 0) { best_pdir = PDIR_FWD; mv1 = p_mv_mode->all_single_mv[best_fwd_ref]; cu_set_mvs_noskip(p_cu, block, best_fwd_ref, &mv1, INVALID_REF, &mv2); } else if (h->i_type == SLICE_TYPE_F) { dist_t dual_mcost = MAX_DISTORTION; if (dualpred_enabled && (!(p_cu->cu_info.i_level == B8X8_IN_BIT && mode >= PRED_2NxN && mode <= PRED_nRx2N))) { pred_inter_search_dual(h, p_cu, p_cb, p_me, &dual_mcost, &dual_best_fst_ref, &dual_best_snd_ref); } if (fwd_cost <= dual_mcost) { best_pdir = PDIR_FWD; ref1 = best_fwd_ref; mv1 = p_mv_mode->all_single_mv[ref1]; cu_set_mvs_noskip(p_cu, block, ref1, &mv1, INVALID_REF, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_FWD]; } else { fwd_cost = dual_mcost; best_pdir = PDIR_DUAL; ref1 = dual_best_fst_ref; ref2 = dual_best_snd_ref; mv1 = p_mv_mode->all_dual_mv_1st[ref1]; mv2 = p_mv_mode->all_dual_mv_2nd[ref1]; cu_set_mvs_noskip(p_cu, block, ref1, &mv1, ref2, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_DUAL]; } } else if (h->i_type == SLICE_TYPE_B) { dist_t sym_mcost = MAX_DISTORTION; dist_t bid_mcost = MAX_DISTORTION; best_fwd_ref = 0; // must reset if (!((p_cu->cu_info.i_level == B8X8_IN_BIT) && (mode >= PRED_2NxN && mode <= PRED_nRx2N))) { pred_inter_search_bi(h, p_cu, p_cb, p_me, &sym_mcost, &bid_mcost); } if (fwd_cost <= bwd_cost 
&& fwd_cost <= sym_mcost && fwd_cost <= bid_mcost) { best_pdir = PDIR_FWD; ref1 = B_FWD; mv1 = p_mv_mode->all_single_mv[ref1]; cu_set_mvs_noskip(p_cu, block, ref1, &mv1, INVALID_REF, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_FWD]; } else if (bwd_cost <= fwd_cost && bwd_cost <= sym_mcost && bwd_cost <= bid_mcost) { fwd_cost = bwd_cost; best_pdir = PDIR_BWD; ref2 = B_BWD; mv2 = p_mv_mode->all_single_mv[B_BWD]; cu_set_mvs_noskip(p_cu, block, INVALID_REF, &mv1, ref2, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_BWD]; } else if (sym_mcost <= fwd_cost && sym_mcost <= bwd_cost && sym_mcost <= bid_mcost) { int dist_fwd = calculate_distance(h, B_FWD); // fwd int dist_bwd = calculate_distance(h, B_BWD); // bwd fwd_cost = sym_mcost; best_pdir = PDIR_SYM; ref1 = B_FWD; ref2 = B_BWD; mv1 = p_mv_mode->all_sym_mv[0]; mv2.x = -scale_mv_skip ( mv1.x, dist_bwd, dist_fwd); mv2.y = -scale_mv_skip_y(h, mv1.y, dist_bwd, dist_fwd); cu_set_mvs_noskip(p_cu, block, ref1, &mv1, ref2, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_SYM]; } else { fwd_cost = bid_mcost; best_pdir = PDIR_BID; ref1 = B_FWD; ref2 = B_BWD; mv1 = p_mv_mode->all_dual_mv_1st[0]; mv2 = p_mv_mode->all_dual_mv_2nd[0]; cu_set_mvs_noskip(p_cu, block, ref1, &mv1, ref2, &mv2); p_cu->mvcost[block] = p_me->bmvcost[PDIR_BID]; } } else { ref1 = best_fwd_ref; mv1 = p_mv_mode->all_single_mv[ref1]; cu_set_mvs_noskip(p_cu, block, ref1, &mv1, INVALID_REF, &mv2); p_cu->mvcost[block] = p_me->mvcost[PDIR_FWD]; } total_cost += fwd_cost; // store reference frame index and direction parameters p_mode->ref_idx_single[block] = (int8_t)best_fwd_ref; p_cu->cu_info.b8pdir[block] = (int8_t)best_pdir; } cu_get_mvds(h, p_cu); // MVD return total_cost; // СCost } //#if OPT_DMH_CANDIDATE /* --------------------------------------------------------------------------- * ǰȡŵDMHģʽѡRDO */ static int dmh_bits[9] = { // 0, 3, 3, 4, 4, 5, 5, 5, 5 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static int rdo_get_dmh_candidate(xavs2_t *h, cu_t *p_cu, rdcost_t rdcost_non_dmh) { const int num_dmh_modes = DMH_MODE_NUM + DMH_MODE_NUM - 1; int cu_size = 1 << p_cu->cu_info.i_level; pixel_ssd_t cmp_dmh = g_funcs.pixf.ssd[PART_INDEX(cu_size, cu_size)]; rdcost_t min_distotion = MAX_COST; dist_t distortion; rdcost_t cost; int best_dmh_cand = -1; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; int i; int rate; /* DMHģʽִԤⲢʧ棬ȡʧСһģʽΪDMHѡ */ for (i = 1; i < num_dmh_modes; i++) { /* get prediction data and luma distortion */ p_cu->cu_info.dmh_mode = (int8_t)(i); if (rdo_get_pred_inter(h, p_cu, 1)) { rate = dmh_bits[i]; distortion = cmp_dmh(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); cost = distortion + h->f_lambda_mode * rate; if (cost < min_distotion) { min_distotion = cost; best_dmh_cand = i; } } } if (IS_ALG_ENABLE(OPT_SKIP_DMH_THRES) && min_distotion > (rdcost_t)(1.2 * rdcost_non_dmh)) { /* Dzвdistortion */ return -1; } else { return best_dmh_cand; } } //#endif /* --------------------------------------------------------------------------- * ֡Ԥ黮ַʽѡһŵĻ */ static int cu_select_inter_partition(xavs2_t *h, cu_t *p_cu, int i_level, uint32_t inter_modes, cu_info_t *best, rdcost_t *p_min_rdcost, int b_dhp_enabled, int b_check_dmh) { cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); int best_cu_mode = 1; int mode; int cu_size = 1 << i_level; int cu_size_c = cu_size >> 1; int pix_x = p_cu->i_pix_x; int pix_y = p_cu->i_pix_y; 
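    /* Rough PU-partition selection: each enabled inter partition below is motion
     * estimated and predicted, then ranked by the SA8D cost of luma plus both
     * chroma planes together with the MV and reference-index costs; only the
     * cheapest partition (and its motion data) is handed back to the caller,
     * which runs the full RD check on that single mode. */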
int pix_x_c = pix_x >> 1; int pix_y_c = pix_y >> CHROMA_V_SHIFT; pel_t *p_fenc[3]; int i; int64_t min_cost = MAX_COST; int64_t mecost; int ref1, ref2; UNUSED_PARAMETER(b_check_dmh); UNUSED_PARAMETER(p_min_rdcost); memcpy(best, &p_cu->cu_info, sizeof(cu_info_t)); //inter_modes |= (uint32_t)((1 << PRED_2NxN) | (1 << PRED_Nx2N)); for (mode = 1; mode < MAX_INTER_MODES; mode++) { /* ִ˶ */ if (!(inter_modes & (1 << mode))) { continue; // ֱģʽľ } /* پ(OPT_BYPASS_AMP)P2NxNδţֱַͬPRED_2NxnU/PRED_2NxnD; PNx2Nͬ */ if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) { if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best_cu_mode != PRED_2NxN) { continue; } else if ((mode == PRED_nLx2N || mode == PRED_nRx2N) && best_cu_mode != PRED_Nx2N) { continue; } } p_cu->cu_info.i_mode = (int8_t)mode; cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode); cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled); /* CostѡȡС */ p_cu->cu_info.directskip_wsm_idx = 0; p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.dmh_mode = 0; rdo_get_pred_inter(h, p_cu, 3); p_fenc[0] = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; p_fenc[1] = h->lcu.p_fenc[1] + pix_y_c * FENC_STRIDE + pix_x_c; p_fenc[2] = h->lcu.p_fenc[2] + pix_y_c * FENC_STRIDE + pix_x_c; mecost = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)](p_layer->buf_pred_inter, FREC_STRIDE, p_fenc[0], FENC_STRIDE); mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c, FREC_CSTRIDE, p_fenc[1], FENC_STRIDE); mecost += g_funcs.pixf.sa8d[PART_INDEX(cu_size_c, cu_size_c)](p_enc->buf_pred_inter_c + (FREC_CSTRIDE >> 1), FREC_CSTRIDE, p_fenc[2], FENC_STRIDE); for (i = 0; i < p_cu->cu_info.num_pu; i++) { mecost += p_cu->mvcost[i]; ref1 = p_cu->cu_info.ref_idx_1st[i]; ref2= p_cu->cu_info.ref_idx_2nd[i]; if (h->i_type != SLICE_TYPE_B) { mecost += (ref1 == INVALID_REF? 0: REF_COST(ref1)); mecost += (ref2 == INVALID_REF? 
0: REF_COST(ref2)); } } if (mecost < min_cost) { memcpy(&p_layer->cu_mode.best_mc_tmp, &p_cu->mc, sizeof(p_cu->mc)); memcpy(best, &p_cu->cu_info, sizeof(cu_info_t)); min_cost = mecost; best_cu_mode = mode; } } return best_cu_mode; } /* --------------------------------------------------------------------------- * ͨ֡Ԥ黮ַʽӦCost */ static void cu_check_inter_partition(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int mode, int i_level, cu_info_t *best, rdcost_t *p_min_rdcost, int b_dhp_enabled, int b_check_dmh) { /* set reference frame and block mode */ cu_init_pu_inter(h, &p_cu->cu_info, i_level, mode); /* ME */ cu_rdo_motion_estimation(h, p_cu, &h->me_state, b_dhp_enabled); h->lcu.bypass_all_dmh = 0; /* һ֡仮ģʽRDCostȷűģʽ */ p_cu->cu_info.directskip_wsm_idx = 0; p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.dmh_mode = 0; cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, best); /* DMHģʽ */ if (h->i_type == SLICE_TYPE_F && h->param->enable_dmh && !h->lcu.bypass_all_dmh && b_check_dmh && !(i_level == B8X8_IN_BIT && mode != PRED_2Nx2N)) { // disable 8x4 or 4x8 2MVs/PU mode int dmh_mode_candidate = 0; int max_dmh_mode; int best_dmh_mode = 0; int dmh_mode; if (p_cu->cu_info.b8pdir[0] == PDIR_FWD && p_cu->cu_info.b8pdir[1] == PDIR_FWD && p_cu->cu_info.b8pdir[2] == PDIR_FWD && p_cu->cu_info.b8pdir[3] == PDIR_FWD) { /* MEȷŵPUԤⷽΪǰ򣬴ʱֻҪDMHģʽ */ dmh_mode = 1; } else { // DHP Ҳο֡Ϊ2ʱп /* ŵPUа˫ǰ飬ʱҪPUΪǰʱRDCostsٱDMHģʽ */ /* ʱMEͬʱһPUҪ */ cu_rdo_motion_estimation(h, p_cu, &h->me_state, -1); dmh_mode = 0; } /* ܼ 2 * (DMH_MODE_NUM - 1) + 1 ģʽ */ max_dmh_mode = DMH_MODE_NUM + DMH_MODE_NUM - 1; /* 㷨DMHѡģʽйҪģʽ * αģʽ޴ļ */ if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) { dmh_mode_candidate = rdo_get_dmh_candidate(h, p_cu, *p_min_rdcost); } // ijģʽµIJвΪȫʱкdmhģʽ for (; dmh_mode < max_dmh_mode && !h->lcu.bypass_all_dmh; dmh_mode++) { if (IS_ALG_ENABLE(OPT_DMH_CANDIDATE)) { if (dmh_mode != 0 && dmh_mode != dmh_mode_candidate) { continue; } } else { if (dmh_mode > (DMH_MODE_NUM - 1)) { if (best_dmh_mode != (dmh_mode - (DMH_MODE_NUM - 1))) { // ֻͬչ continue; } } } p_cu->cu_info.dmh_mode = (int8_t)dmh_mode; if (cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, best)) { best_dmh_mode = dmh_mode; } } // end loop of DMH modes } // end of check DMH modes } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void cu_set_mvs_skip(xavs2_t *h, cu_t *p_cu) { int weighted_skip_mode = p_cu->cu_info.directskip_wsm_idx; int ds_mode = p_cu->cu_info.directskip_mhp_idx; const cu_mode_t *p_cu_mode = cu_get_layer_mode(h, p_cu->cu_info.i_level); int k; assert(p_cu->cu_info.i_mode == PRED_SKIP); assert(h->i_type != SLICE_TYPE_I); if (ds_mode != DS_NONE) { mv_t mv1 = p_cu_mode->skip_mv_1st[ds_mode]; mv_t mv2 = p_cu_mode->skip_mv_2nd[ds_mode]; int8_t ref1 = p_cu_mode->skip_ref_1st[ds_mode]; int8_t ref2 = p_cu_mode->skip_ref_2nd[ds_mode]; int i_dir_pred = tab_pdir_bskip[ds_mode]; for (k = 0; k < 4; k++) { p_cu->cu_info.b8pdir[k] = (int8_t)i_dir_pred; p_cu->mc.mv[k][0] = mv1; p_cu->mc.mv[k][1] = mv2; p_cu->cu_info.ref_idx_1st[k] = ref1; p_cu->cu_info.ref_idx_2nd[k] = ref2; } } else if (weighted_skip_mode) { for (k = 0; k < 4; k++) { p_cu->cu_info.b8pdir[k] = PDIR_FWD; p_cu->mc.mv[k][0] = p_cu_mode->tskip_mv[k][0]; p_cu->mc.mv[k][1] = p_cu_mode->tskip_mv[k][weighted_skip_mode]; p_cu->cu_info.ref_idx_1st[k] = 0; p_cu->cu_info.ref_idx_2nd[k] = (int8_t)weighted_skip_mode; } } else if (h->i_type != SLICE_TYPE_B) { for (k = 0; k < 4; k++) { p_cu->cu_info.b8pdir[k] = PDIR_FWD; p_cu->mc.mv[k][0] = 
p_cu_mode->tskip_mv[k][0]; p_cu->mc.mv[k][1].v = 0; p_cu->cu_info.ref_idx_1st[k] = 0; p_cu->cu_info.ref_idx_2nd[k] = INVALID_REF; } } else { for (k = 0; k < 4; k++) { p_cu->cu_info.b8pdir[k] = PDIR_SYM; p_cu->mc.mv[k][0] = p_cu_mode->tskip_mv[k][0]; p_cu->mc.mv[k][1] = p_cu_mode->tskip_mv[k][1]; p_cu->cu_info.ref_idx_1st[k] = B_FWD; p_cu->cu_info.ref_idx_2nd[k] = B_BWD; } } } /* --------------------------------------------------------------------------- */ typedef struct cu_skip_mc_t { mv_t mv_1st[4]; mv_t mv_2nd[4]; int8_t ref_1st[4]; int8_t ref_2nd[4]; } cu_skip_mc_t; /* --------------------------------------------------------------------------- * SkipMVԼ⵱ǰģʽMVǷ񱻱 */ static ALWAYS_INLINE int is_same_skip_mc_param(const cu_skip_mc_t *p_src1, const cu_skip_mc_t *p_src2) { uint32_t *p1 = (uint32_t *)p_src1; uint32_t *p2 = (uint32_t *)p_src2; int num = sizeof(cu_skip_mc_t) >> 2; int i; for (i = 0; i < num; i++) { if (p1[i] != p2[i]) { return 0; } } return 1; } /* --------------------------------------------------------------------------- * SkipMVԼ⵱ǰģʽMVǷ񱻱 */ static int update_skip_mv_list(cu_skip_mc_t *p_skip_mvs, int i_num, cu_t *p_cu) { cu_skip_mc_t cur_mc; int i; for (i = 0; i < 4; i++) { cur_mc.mv_1st[i].v = p_cu->mc.mv[i][0].v; cur_mc.mv_2nd[i].v = p_cu->mc.mv[i][1].v; cur_mc.ref_1st[i] = p_cu->cu_info.ref_idx_1st[i]; cur_mc.ref_2nd[i] = p_cu->cu_info.ref_idx_2nd[i]; } for (i = 0; i < i_num; i++) { if (is_same_skip_mc_param(p_skip_mvs + i, &cur_mc)) { break; } } if (i != i_num) { return 0; } else { memcpy(p_skip_mvs + i_num, &cur_mc, sizeof(cu_skip_mc_t)); return 1; } } /* --------------------------------------------------------------------------- * Skip/DirectģʽıۣԤвѡȡŵSkipģʽһRDO */ static void cu_check_skip_direct_rough2(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost) { cu_skip_mc_t skip_mc_params[DS_MAX_NUM + XAVS2_MAX_REFS]; int num_mc_params = 0; int max_skip_mode_num, i; int cu_size = p_cu->i_size; pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)]; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; dist_t distortion; rdcost_t rdcost; rdcost_t min_rdcost = MAX_COST; int best_skip_mode = DS_NONE; int best_weighted_skip = -1; // also used to verify an additional skip mode is found cb_t cur_cb = { { 0 } }; cur_cb.w = cur_cb.h = (int8_t)p_cu->i_size; cu_get_neighbors(h, p_cu, &cur_cb); /* get Skip/Direct MVs and temporal SKIP mode number */ max_skip_mode_num = h->lcu.get_skip_mvs(h, p_cu); /* 0, init cu data */ p_cu->cu_info.dmh_mode = 0; p_cu->cu_info.i_cbp = 0; int rate; /* 1, temporal skip mode, derive MV from temporal */ p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; /* ʱMVPԤֱRDCostٸŵRDCostȽϣ 3%ңʱ 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); /* 2, Weighted skip mode, derive MV from temporal and scaling */ for (i = 1; i < max_skip_mode_num; i++) { int need_check_mv; p_cu->cu_info.directskip_wsm_idx = (int8_t)i; cu_set_mvs_skip(h, p_cu); cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = p_aec->binary.est_cu_header(h, p_aec, p_cu); distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < 
min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = i; } } } /* 3, ĸspatial direct (single first, single second, dual first, dual second) */ if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { int need_check_mv; p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4+i]; distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } /* distortionСģʽѡһŵ */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } else if ((h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) && (h->fdec->rps.poc == 2 || h->fdec->rps.poc == 6)) { if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != NULL && p_cu->p_topR_cu != NULL) { if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 || p_cu->p_topA_cu->i_cbp == 0 || p_cu->p_topL_cu->i_cbp == 0 || p_cu->p_topR_cu->i_cbp == 0)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { int need_check_mv; p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i]; distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } /* distortionСģʽѡһŵ */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } else { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } } } else { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } } } else if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { if (p_cu->p_left_cu != NULL && p_cu->p_topA_cu != NULL && p_cu->p_topL_cu != 
NULL && p_cu->p_topR_cu != NULL) { if ((p_cu->p_left_cu->i_mode == 0 && p_cu->p_topA_cu->i_mode == 0 && p_cu->p_topL_cu->i_mode == 0 && p_cu->p_topR_cu->i_mode == 0) && (p_cu->p_left_cu->i_cbp == 0 && p_cu->p_topA_cu->i_cbp == 0 && p_cu->p_topL_cu->i_cbp == 0 && p_cu->p_topR_cu->i_cbp == 0)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { int need_check_mv; p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i]; distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } /* distortionСģʽѡһŵ */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } else { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } } } else { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } } } } static void cu_check_skip_direct_rough1(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost) { cu_skip_mc_t skip_mc_params[DS_MAX_NUM + XAVS2_MAX_REFS]; int num_mc_params = 0; int max_skip_mode_num, i; int cu_size = p_cu->i_size; pixel_ssd_t cmp_skip = g_funcs.pixf.sa8d[PART_INDEX(cu_size, cu_size)]; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); pel_t *p_fenc = h->lcu.p_fenc[0] + p_cu->i_pos_y * FENC_STRIDE + p_cu->i_pos_x; dist_t distortion; rdcost_t rdcost; rdcost_t min_rdcost = MAX_COST; int best_skip_mode = DS_NONE; int best_weighted_skip = -1; // also used to verify an additional skip mode is found cb_t cur_cb = { { 0 } }; cur_cb.w = cur_cb.h = (int8_t)p_cu->i_size; cu_get_neighbors(h, p_cu, &cur_cb); /* get Skip/Direct MVs and temporal SKIP mode number */ max_skip_mode_num = h->lcu.get_skip_mvs(h, p_cu); /* 0, init cu data */ p_cu->cu_info.dmh_mode = 0; p_cu->cu_info.i_cbp = 0; int rate; /* 1, temporal skip mode, derive MV from temporal */ p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; /* ʱMVPԤֱRDCostٸŵRDCostȽϣ 3%ңʱ 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); num_mc_params += update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); if (rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[0];//p_aec->binary.est_cu_header(h, p_aec, p_cu); distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); min_rdcost = distortion + h->f_lambda_mode * rate; XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); best_weighted_skip = 0; best_skip_mode = DS_NONE; } /* 2, Weighted skip mode, derive MV from temporal and scaling 
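       (each candidate reuses the temporally derived MV scaled for another reference;
       duplicates are filtered out with update_skip_mv_list(), the survivors are ranked
       by SA8D cost against the headerbits_skipmode bit estimates, and only the single
       best skip candidate receives the full RD check at the end of the function)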
*/ for (i = 1; i < max_skip_mode_num; i++) { int need_check_mv; p_cu->cu_info.directskip_wsm_idx = (int8_t)i; cu_set_mvs_skip(h, p_cu); cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = i; } } } /* 3, ĸspatial direct (single first, single second, dual first, dual second) */ if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { int need_check_mv; p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); need_check_mv = update_skip_mv_list(skip_mc_params, num_mc_params, p_cu); num_mc_params += need_check_mv; if (need_check_mv && rdo_get_pred_inter(h, p_cu, 1)) { rate = headerbits_skipmode[4 + i];//p_aec->binary.est_cu_header(h, p_aec, p_cu); distortion = cmp_skip(p_fenc, FENC_STRIDE, p_layer->buf_pred_inter, FREC_STRIDE); rdcost = distortion + h->f_lambda_mode * rate; if (rdcost < min_rdcost) { XAVS2_SWAP_PTR(p_layer->buf_pred_inter, p_layer->buf_pred_inter_best); min_rdcost = rdcost; best_weighted_skip = 0; best_skip_mode = i; } } } } /* distortionСģʽѡһŵ */ p_cu->cu_info.directskip_mhp_idx = (int8_t)best_skip_mode; p_cu->cu_info.directskip_wsm_idx = (int8_t)best_weighted_skip; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } /* --------------------------------------------------------------------------- * Skip/DirectģʽıۣRDOѡȡŵSkipģʽ */ static void cu_check_skip_direct_fullrdo(xavs2_t *h, aec_t *p_aec, cu_info_t *p_best, cu_t *p_cu, rdcost_t *p_min_rdcost) { int max_skip_mode_num, i; cb_t cur_cb = { { 0 } }; cur_cb.w = cur_cb.h = (int8_t)p_cu->i_size; cu_get_neighbors(h, p_cu, &cur_cb); /* get Skip/Direct MVs and temporal SKIP mode number */ max_skip_mode_num = h->lcu.get_skip_mvs(h, p_cu); /* 0, init cu data */ p_cu->cu_info.dmh_mode = 0; /* 1, temporal skip mode, derive MV from temporal */ p_cu->cu_info.directskip_mhp_idx = DS_NONE; p_cu->cu_info.directskip_wsm_idx = 0; /* ʱMVPԤֱRDCostٸŵRDCostȽϣ 3%ңʱ 20%~30% */ cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); /* 2, Weighted skip mode, derive MV from temporal and scaling */ for (i = 1; i < max_skip_mode_num; i++) { p_cu->cu_info.directskip_wsm_idx = (int8_t)i; cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } /* 3, ĸspatial direct (single first, single second, dual first, dual second) */ if (h->i_type == SLICE_TYPE_B || (h->i_type == SLICE_TYPE_F && h->param->enable_mhp_skip)) { p_cu->cu_info.directskip_wsm_idx = 0; for (i = 0; i < DS_MAX_NUM; i++) { p_cu->cu_info.directskip_mhp_idx = (int8_t)i; cu_init_pu_inter(h, &p_cu->cu_info, p_cu->cu_info.i_level, PRED_SKIP); cu_set_mvs_skip(h, p_cu); cu_rdcost_inter(h, p_aec, p_cu, p_min_rdcost, p_best); } } } #if SAVE_CU_INFO //#if OPT_EARLY_SKIP /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int is_early_skip(xavs2_t *h, cu_t *p_cu) { 
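    /* Early-skip test used by compress_cu_inter(): the left, top, top-left,
     * top-right and temporally collocated CUs are classified as "skip-like"
     * (mode <= PRED_2Nx2N or PRED_I_2Nx2N, with zero CBP); only when all five
     * neighbours qualify and the current CU's mode field is itself no larger
     * than PRED_2Nx2N are the remaining inter and intra modes bypassed. */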
#define IS_EARLY_SKIP_CU(mode, cbp) (((mode) <= PRED_2Nx2N || (mode) == PRED_I_2Nx2N ) && (cbp) == 0) // faster //#define IS_EARLY_SKIP_CU(mode, cbp) ((mode) == PRED_SKIP && (cbp) == 0) // fast #define IS_EAYLY_SKIP(p_cu_info) IS_EARLY_SKIP_CU(p_cu_info->i_mode, p_cu_info->i_cbp) // each neighbor block (left, top, top-left, top-right, col) is skip mode? int left_skip = p_cu->p_left_cu && IS_EAYLY_SKIP(p_cu->p_left_cu); int top_skip = p_cu->p_topA_cu && IS_EAYLY_SKIP(p_cu->p_topA_cu); int topleft_skip = p_cu->p_topL_cu && IS_EAYLY_SKIP(p_cu->p_topL_cu); int topright_skip = p_cu->p_topR_cu && IS_EAYLY_SKIP(p_cu->p_topR_cu); xavs2_frame_t *p_col_ref = h->fref[0]; int i_scu_xy = p_cu->i_scu_xy; int col_skip = IS_EARLY_SKIP_CU(p_col_ref->cu_mode[i_scu_xy], p_col_ref->cu_cbp[i_scu_xy]); #undef IS_EARLY_SKIP_CU #undef IS_EAYLY_SKIP return (left_skip + top_skip + topleft_skip + topright_skip + col_skip > 4) && p_cu->cu_info.i_mode <= PRED_2Nx2N; // return left_skip && top_skip && topleft_skip && topright_skip && col_skip && p_cu->cu_info.i_mode == PRED_SKIP; } //#endif #endif //#if OPT_PSC_MD || OPT_TR_KEY_FRAME_MD /* --------------------------------------------------------------------------- */ static void update_valid_modes_by_complexity(xavs2_t *h, cu_t *p_cu, uint32_t *valid_pred_modes) { static const int mode_weight_factor[3][MAX_PRED_MODES + 1] = { { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, // neighbor bitsize < cur bitsize { 1, 1, 1, 2, 2, 4, 4, 4, 4, 1, 4, 4, 4 }, // neighbor bitsize == cur bitsize { 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } // neighbor bitsize > cur bitsize }; #define REAL_CU_TYPE(cu_type, cbp) (cu_type + 1 - (!cu_type && !!cbp)) static const double thres_complexity_min = 1.0f; static const double thres_complexity_max = 4.0f; static const uint32_t modes_simple_tex = (1 << PRED_SKIP) | (1 << PRED_2Nx2N) | (1 << PRED_I_2Nx2N); static const uint32_t mask_complex_tex = (uint32_t)~((1 << PRED_I_2Nx2N) | (1 << PRED_2Nx2N) | (1 << PRED_2NxN) | (1 << PRED_Nx2N)); static const uint32_t mask_non_keyframe_modes = 0x0F; int cu_weight_sum = 0; int mode_complexity = 0; int i_level = p_cu->cu_info.i_level; cu_info_t *cu_left = p_cu->p_left_cu; cu_info_t *cu_up = p_cu->p_topA_cu; cu_info_t *cu_upleft = p_cu->p_topL_cu; cu_info_t *cu_upright = p_cu->p_topR_cu; #if SAVE_CU_INFO int scu_xy = p_cu->i_scu_xy; xavs2_frame_t *ref_fwd = h->i_type == SLICE_TYPE_B ? h->fref[1] : h->fref[0]; xavs2_frame_t *ref_bwd = h->i_type == SLICE_TYPE_B ? 
h->fref[0] : NULL; #endif int sign_idx, type_idx; if (IS_ALG_ENABLE(OPT_TR_KEY_FRAME_MD)) { if (h->fdec->i_frm_poc & 2) { *valid_pred_modes &= mask_non_keyframe_modes; return; } } /* IS_ALG_ENABLE(OPT_PSC_MD) */ // left if (cu_left) { cu_weight_sum += 6; sign_idx = xavs2_sign3(cu_left->i_level - i_level) + 1; type_idx = REAL_CU_TYPE(cu_left->i_mode, cu_left->i_cbp); mode_complexity += 6 * mode_weight_factor[sign_idx][type_idx]; } // up if (cu_up) { cu_weight_sum += 6; sign_idx = xavs2_sign3(cu_up->i_level - i_level) + 1; type_idx = REAL_CU_TYPE(cu_up->i_mode, cu_up->i_cbp); mode_complexity += 6 * mode_weight_factor[sign_idx][type_idx]; } // upleft if (cu_upleft) { cu_weight_sum += 2; sign_idx = xavs2_sign3(cu_upleft->i_level - i_level) + 1; type_idx = REAL_CU_TYPE(cu_upleft->i_mode, cu_upleft->i_cbp); mode_complexity += 2 * mode_weight_factor[sign_idx][type_idx]; } // upright if (cu_upright) { cu_weight_sum += 2; sign_idx = xavs2_sign3(cu_upright->i_level - i_level) + 1; type_idx = REAL_CU_TYPE(cu_upright->i_mode, cu_upright->i_cbp); mode_complexity += 2 * mode_weight_factor[sign_idx][type_idx]; } #if SAVE_CU_INFO // temporal forward if (ref_fwd) { cu_weight_sum += 1; sign_idx = xavs2_sign3(ref_fwd->cu_level[scu_xy] - i_level) + 1; type_idx = REAL_CU_TYPE(ref_fwd->cu_mode[scu_xy], ref_fwd->cu_cbp[scu_xy]); mode_complexity += 1 * mode_weight_factor[sign_idx][type_idx]; } // temporal backward if (ref_bwd) { cu_weight_sum += 1; sign_idx = xavs2_sign3(ref_bwd->cu_level[scu_xy] - i_level) + 1; type_idx = REAL_CU_TYPE(ref_bwd->cu_mode[scu_xy], ref_bwd->cu_cbp[scu_xy]); mode_complexity += 1 * mode_weight_factor[sign_idx][type_idx]; } #else mode_complexity += 2; #endif if (mode_complexity < thres_complexity_min * cu_weight_sum) { *valid_pred_modes &= modes_simple_tex; } else if (mode_complexity >= thres_complexity_max * cu_weight_sum) { *valid_pred_modes &= mask_complex_tex; } #undef REAL_CU_TYPE } //#endif //#if OPT_ET_HOMO_MV /* --------------------------------------------------------------------------- */ static int is_ET_inter_recur(xavs2_t *h, cu_t *p_cu, cu_info_t *curr_best) { int b_avail_up = p_cu->p_topA_cu != NULL; int b_avail_left = p_cu->p_left_cu != NULL; #if SAVE_CU_INFO int b_avail_col = IS_INTER_MODE(curr_best->i_mode) & IS_INTER_MODE(h->fref[0]->cu_mode[p_cu->i_scu_xy]); #else int b_avail_col = FALSE; #endif int num_blk_pixels[4]; float mv_avg_x[4], mv_avg_y[4]; int start_b4_x, start_b4_y; if (b_avail_up && b_avail_left && b_avail_col) { cu_mode_t *p_mode = cu_get_layer_mode(h, curr_best->i_level); int w_in_4x4 = h->i_width_in_minpu; int b4_x = p_cu->i_pix_x >> MIN_PU_SIZE_IN_BIT; int b4_y = p_cu->i_pix_y >> MIN_PU_SIZE_IN_BIT; int b4_size = 1 << (p_cu->cu_info.i_level - MIN_PU_SIZE_IN_BIT); int b4_num = b4_size * b4_size; float mvs_avg_x = 0, mvs_avg_y = 0; float mvs_var_x = 0, mvs_var_y = 0; mv_t cur_blk_mvs[4]; const mv_t *p_mv_1st = h->fwd_1st_mv + b4_y * w_in_4x4 + b4_x; const mv_t *col_mv = h->fref[0]->pu_mv; const int w_in_16x16 = (h->i_width_in_minpu + 3) >> 2; int i, j, k; assert(curr_best->i_mode >= 0 && curr_best->i_mode < MAX_INTER_MODES); for (i = 0; i < 4; i++) { mv_avg_x[i] = 0; mv_avg_y[i] = 0; } // left column & top row start_b4_x = -1; start_b4_y = 0; for (j = 0; j < b4_size; j++) { mv_avg_x[0] += p_mv_1st[j * w_in_4x4 - 1].x; mv_avg_y[0] += p_mv_1st[j * w_in_4x4 - 1].y; mv_avg_x[1] += p_mv_1st[j - w_in_4x4].x; mv_avg_y[1] += p_mv_1st[j - w_in_4x4].y; } mv_avg_x[0] *= b4_size; mv_avg_y[0] *= b4_size; mv_avg_x[1] *= b4_size; mv_avg_y[1] *= b4_size; // collocated 
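    /* The collocated MVs are read from the reference frame's pu_mv array, which
     * is stored on a 16x16 grid, hence the ">> 2" applied to the 4x4-block
     * coordinates below.  The averages and absolute deviations gathered over the
     * left column, top row, collocated area and the current CU's own PUs feed
     * the homogeneity test at the end of is_ET_inter_recur(): if the MV spread
     * stays below 4 units per 4x4 block in both components, compress_ctu_inter()
     * treats the motion as homogeneous and stops splitting the CU further. */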
start_b4_x = b4_x; start_b4_y = b4_y; for (j = 0; j < b4_size; j++) { for (i = 0; i < b4_size; i++) { mv_avg_x[2] += col_mv[((start_b4_y + j) >> 2) * w_in_16x16 + ((start_b4_x + i) >> 2)].x; mv_avg_y[2] += col_mv[((start_b4_y + j) >> 2) * w_in_16x16 + ((start_b4_x + i) >> 2)].y; } } // current cu for (k = 0; k < curr_best->num_pu; k++) { mv_t mv_1st = p_mode->best_mc.mv[k][0]; cb_t cur_cb = curr_best->cb[k]; int width, height; cur_cb.v >>= 2; width = cur_cb.w; height = cur_cb.h; cur_blk_mvs[k] = mv_1st; num_blk_pixels[k] = width * height; mv_avg_x[3] += mv_1st.x * num_blk_pixels[k]; mv_avg_y[3] += mv_1st.y * num_blk_pixels[k]; } for (; k < 4; k++) { num_blk_pixels[k] = 0; cur_blk_mvs[k].v = 0; } for (i = 0; i < 4; i++) { mv_avg_x[i] /= b4_num; mv_avg_y[i] /= b4_num; mvs_avg_x += mv_avg_x[i]; mvs_avg_y += mv_avg_y[i]; } // left column & top row for (j = 0; j < b4_size; j++) { mvs_var_x += XAVS2_ABS(p_mv_1st[j * w_in_4x4 - 1].x - mvs_avg_x); mvs_var_y += XAVS2_ABS(p_mv_1st[j * w_in_4x4 - 1].y - mvs_avg_y); mvs_var_x += XAVS2_ABS(p_mv_1st[j - w_in_4x4].x - mvs_avg_x); mvs_var_y += XAVS2_ABS(p_mv_1st[j - w_in_4x4].y - mvs_avg_y); } mvs_var_x *= b4_size; mvs_var_y *= b4_size; // collocated start_b4_x = b4_x; start_b4_y = b4_y; for (j = 0; j < b4_size; j++) { for (i = 0; i < b4_size; i++) { mvs_var_x += XAVS2_ABS(col_mv[((start_b4_y + j) >> 2) * w_in_16x16 + ((start_b4_x + i) >> 2)].x - mvs_avg_x); mvs_var_y += XAVS2_ABS(col_mv[((start_b4_y + j) >> 2) * w_in_16x16 + ((start_b4_x + i) >> 2)].y - mvs_avg_y); } } // current for (i = 0; i < 4; i++) { mvs_var_x += XAVS2_ABS(cur_blk_mvs[i].x - mvs_avg_x) * num_blk_pixels[i]; mvs_var_y += XAVS2_ABS(cur_blk_mvs[i].y - mvs_avg_y) * num_blk_pixels[i]; } return (mvs_var_x < 4 * b4_num && mvs_var_y < 4 * b4_num); } return 0; } //#endif /* --------------------------------------------------------------------------- * encode an intra cu (for I-picture) */ static rdcost_t compress_cu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best, rdcost_t cost_limit) { int i_level = p_cu->cu_info.i_level; cu_layer_t *p_layer = cu_get_layer(h, i_level); uint32_t intra_modes; // valid intra modes rdcost_t split_flag_cost = 0; rdcost_t min_rdcost = MAX_COST; int mode; UNUSED_PARAMETER(cost_limit); if (i_level > MIN_CU_SIZE_IN_BIT) { split_flag_cost = h->f_lambda_mode * p_aec->binary.write_ctu_split_flag(p_aec, 0, i_level); } h->lcu.b_enable_rdoq = (h->param->i_rdoq_level == RDOQ_ALL); h->lcu.b_2nd_rdcost_pass = 1; h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma; //===== SET VALID MODES ===== intra_modes = cu_get_valid_modes(h, h->i_type, i_level); // reset default parameters for chroma intra predictor p_cu->cu_info.i_intra_mode_c = DC_PRED_C; p_cu->cu_info.directskip_wsm_idx = 0; p_cu->cu_info.directskip_mhp_idx = DS_NONE; //===== GET BEST MACROBLOCK MODE ===== for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) { if (!(intra_modes & (1 << mode))) { continue; // ֱģʽ } if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) { // һǶԳ֡ģʽǰ if (sdip_early_bypass(h, p_layer, mode)) { continue; } } // init coding block(s) p_cu->cu_info.i_mode = (int8_t)mode; // set cu type cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } /* ģʽRDOQ */ if (h->param->i_rdoq_level == RDOQ_CU_LEVEL && best->i_cbp > 0) { h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; h->lcu.b_enable_rdoq = 1; mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } min_rdcost += split_flag_cost; return p_layer->best_rdcost 
= min_rdcost; } /* --------------------------------------------------------------------------- * encode an inter cu (for none I-picture) */ static rdcost_t compress_cu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, cu_info_t *best, uint32_t avail_modes, // available prediction partition modes rdcost_t min_rdcost, // Cost of Skip/Direct mode rdcost_t cost_limit) { int b_dhp_enabled = h->param->enable_dhp && h->i_type == SLICE_TYPE_F && h->i_ref > 1; int i_level = p_cu->cu_info.i_level; int b_bypass_intra = 0; int b_check_dmh = 1; int mode; cu_layer_t *p_layer = cu_get_layer(h, p_cu->cu_info.i_level); /* ------------------------------------------------------------- * 1, ʼ */ UNUSED_PARAMETER(cost_limit); h->lcu.get_intra_dir_for_rdo_luma = h->get_intra_candidates_luma; h->enable_tu_2level = IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) ? 0 : 2; h->lcu.b_enable_rdoq = (h->param->i_rdoq_level == RDOQ_ALL); h->lcu.b_2nd_rdcost_pass = 0; for (mode = 0; mode < MAX_PRED_MODES; mode++) { p_layer->mode_rdcost[mode] = MAX_COST; } /* reset chroma intra predictor to default */ p_cu->cu_info.i_intra_mode_c = DC_PRED_C; // @luoflƳУᵼ²ƥ⣻20170304 19:52:32 /* ------------------------------------------------------------- * 2, SkipDirectģʽ */ /* SKIP/Directģʽ */ p_cu->cu_info.i_mode = PRED_SKIP; if (IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL) && h->skip_rough_improved) { cu_check_skip_direct_rough2(h, p_aec, best, p_cu, &min_rdcost); } else if (IS_ALG_ENABLE(OPT_ROUGH_SKIP_SEL)) { cu_check_skip_direct_rough1(h, p_aec, best, p_cu, &min_rdcost); } else { cu_check_skip_direct_fullrdo(h, p_aec, best, p_cu, &min_rdcost); } p_layer->best_rdcost = min_rdcost; // update valid modes if (IS_ALG_ENABLE(OPT_PSC_MD) || IS_ALG_ENABLE(OPT_TR_KEY_FRAME_MD)) { update_valid_modes_by_complexity(h, p_cu, &avail_modes); } if (IS_ALG_ENABLE(OPT_ROUGH_MODE_SKIP)) { if (h->i_type == SLICE_TYPE_B && !h->fdec->rps.referd_by_others && (i_level == B64X64_IN_BIT || i_level == B32X32_IN_BIT)) { avail_modes &= (uint32_t)~((1 << PRED_2NxN) | (1 << PRED_Nx2N)); } } /* ------------------------------------------------------------- * 3, Skip/Direct֡ģʽ */ for (mode = 1; mode < MAX_INTER_MODES; mode++) { if (!(avail_modes & (1 << mode))) { continue; // ֱģʽľ } /* ------------------------------------------------------------- * 3.1 Skip/DirectģʽصĿģʽ㷨ڴ˴ */ #if SAVE_CU_INFO if (IS_ALG_ENABLE(OPT_EARLY_SKIP)) { if (is_early_skip(h, p_cu)) { b_bypass_intra = 1; break; // bypass all rest inter & intra modes } } #endif /* PUģʽߣ * P2NxNδţֱַͬPRED_2NxnU/PRED_2NxnD; PNx2Nͬ */ if (IS_ALG_ENABLE(OPT_BYPASS_AMP) && i_level > B16X16_IN_BIT) { if ((mode == PRED_2NxnU || mode == PRED_2NxnD) && best->i_mode != PRED_2NxN) { continue; } else if ((mode == PRED_nLx2N || mode == PRED_nRx2N) && best->i_mode != PRED_Nx2N) { continue; } } /* ------------------------------------------------------------- * 3.2, Ա뵱ǰPUģʽ */ p_cu->cu_info.i_mode = (int8_t)mode; if (IS_ALG_ENABLE(OPT_ROUGH_PU_SEL) && mode == PRED_2Nx2N) { cu_info_t cur_best; cu_select_inter_partition(h, p_cu, i_level, avail_modes, &cur_best, &min_rdcost, b_dhp_enabled, b_check_dmh); mode = cur_best.i_mode; cu_copy_info(&p_cu->cu_info, &cur_best); memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc_tmp, sizeof(p_cu->mc)); /* MVϢڲ */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); avail_modes &= ~0xfe; // õʣ֡仮ģʽ } else { cu_check_inter_partition(h, p_aec, p_cu, mode, i_level, best, &min_rdcost, b_dhp_enabled, b_check_dmh); } /* ------------------------------------------------------------- * 3.3, ǰͨPUģʽĿپ㷨 */ if (best->i_mode == mode) { if 
(best->dmh_mode != 0) { if (IS_ALG_ENABLE(OPT_BYPASS_MODE_FPIC)) { b_bypass_intra = 1; } } } if (IS_ALG_ENABLE(OPT_FAST_CBF_MODE) && p_cu->cu_info.i_cbp == 0) { if (mode == PRED_2Nx2N && best->i_mode == PRED_SKIP) { b_bypass_intra = 1; break; // bypass all rest inter & intra modes } if (mode >= PRED_2Nx2N && best->i_mode == mode) { b_bypass_intra = 1; break; // bypass all rest inter modes } } if (IS_ALG_ENABLE(OPT_FAST_PU_SEL) && p_cu->cu_info.i_cbp == 0) { if (mode == PRED_2Nx2N && best->i_mode == PRED_SKIP) { b_bypass_intra = 1; break; // bypass all rest inter & intra modes } } } /* ڶTU֣ѡģʽ */ if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC) && best->i_cbp > 0) { h->enable_tu_2level = 1; mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* MVϢڲ */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); }// end of checking inter PU partitions /* ͨ֡Ԥж֡Ҫ֡Ԥʱ֡ģʽ */ if (!h->fenc->b_enable_intra) { b_bypass_intra = 1; } if (IS_ALG_ENABLE(OPT_BYPASS_INTRA_BPIC)) { b_bypass_intra |= (h->i_type == SLICE_TYPE_B && best->i_cbp == 0); // B֡֡Ԥģʽ } /* ò֡ڻģʽ */ if (IS_ALG_ENABLE(OPT_CMS_ETMD)) { /* ֡ģʽ֮ģʽCBPΪ㣬ٱ֡Ԥģʽ */ b_bypass_intra |= ((best->i_cbp == 0) && (best->i_mode == 0)); /* ֡ŻģʽɸѡҪģʽ */ // if (IS_HOR_PU_PART(best->i_mode)) { // avail_modes &= !(1 << PRED_I_nx2N); // } else if (IS_VER_PU_PART(best->i_mode)) { // avail_modes &= !(1 << PRED_I_2Nxn); // } else if (best->i_mode == PRED_SKIP) { // avail_modes &= (1 << PRED_I_2Nx2N); // } } if (IS_ALG_ENABLE(OPT_ROUGH_MODE_SKIP)) { if (h->i_type == SLICE_TYPE_B && i_level == B64X64_IN_BIT) { b_bypass_intra = 1; } if (!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B && i_level != B16X16_IN_BIT) { b_bypass_intra = 1; } } /* ǰСRDCostСijֵ֡ԤģʽѾܹϺõԤ⣬ʱټ֡ģʽ */ if (IS_ALG_ENABLE(OPT_FAST_INTRA_IN_INTER) && min_rdcost < h->thres_qsfd_cu[1][i_level - MIN_CU_SIZE_IN_BIT]) { b_bypass_intra = 1; } /* ------------------------------------------------------------- * 4, get best intra mode */ if (!b_bypass_intra) { for (mode = PRED_I_2Nx2N; mode <= PRED_I_nx2N; mode++) { if (!(avail_modes & (1 << mode))) { continue; // ֱģʽľ } if (IS_ALG_ENABLE(OPT_BYPASS_SDIP)) { // һǶԳ֡ģʽǰ if (sdip_early_bypass(h, p_layer, mode)) { continue; } } // init coding block(s) p_cu->cu_info.i_mode = (int8_t)mode; // cal rd-cost cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); if (IS_ALG_ENABLE(OPT_CMS_ETMD)){ if (best->i_mode != PRED_I_2Nx2N && mode == PRED_I_2Nx2N) { break; } } } } /* ģʽ,TUֻDzֵȷRDOQ */ if (h->param->i_rdoq_level == RDOQ_CU_LEVEL&& best->i_cbp > 0) { if (IS_ALG_ENABLE(OPT_TU_LEVEL_DEC)) { h->enable_tu_2level = 3; } else { h->enable_tu_2level = 2; } h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; h->lcu.b_enable_rdoq = 1; h->lcu.b_2nd_rdcost_pass = 1; mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); if (IS_INTRA_MODE(mode)) { if((!IS_ALG_ENABLE(OPT_BYPASS_INTRA_RDOQ)) || h->i_type == SLICE_TYPE_F){ cu_check_intra(h, p_aec, p_cu, best, mode, &min_rdcost); } } else { memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* MVϢڲ */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); } } else if (IS_ALG_ENABLE(OPT_BIT_EST_PSZT) && i_level >= 5 && (best->i_mode != PRED_SKIP || best->i_cbp != 0)) { h->enable_tu_2level = 2; h->lcu.get_intra_dir_for_rdo_luma = rdo_get_pred_intra_luma_2nd_pass; h->lcu.b_2nd_rdcost_pass = 1; // recheck RDCost mode = best->i_mode; cu_copy_info(&p_cu->cu_info, best); if (IS_INTRA_MODE(mode)) { cu_check_intra(h, p_aec, p_cu, best, mode, 
&min_rdcost); } else { memcpy(&p_cu->mc, &p_layer->cu_mode.best_mc, sizeof(p_cu->mc)); /* MVϢڲ */ cu_rdcost_inter(h, p_aec, p_cu, &min_rdcost, best); } } return p_layer->best_rdcost = min_rdcost; } /* --------------------------------------------------------------------------- */ static INLINE int ctu_intra_depth_pred_mad(xavs2_t *h, int level, int pix_x, int pix_y) { static const int MAD_TH0[] = { 2, 2 * 256, 2 * 1024, 3 * 4096 }; pel_t *p_src_base = h->lcu.p_fenc[0] + pix_y * FENC_STRIDE + pix_x; int cu_size = 1 << level; int mad = g_funcs.pixf.madf[level - MIN_CU_SIZE_IN_BIT](p_src_base, FENC_STRIDE, cu_size); return mad >= MAD_TH0[level - MIN_CU_SIZE_IN_BIT]; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * RDOPTʼʱòͬ֡CUСõģʽֱӲ */ void xavs2_init_valid_mode_table(xavs2_t *h) { int frm_type; int level; for (frm_type = 0; frm_type < SLICE_TYPE_NUM; frm_type++) { int inter_frame = (frm_type != SLICE_TYPE_I); for (level = MIN_CU_SIZE_IN_BIT; level <= MAX_CU_SIZE_IN_BIT; level++) { uint32_t valid_pred_modes = 0; /* set validity of inter modes */ if (inter_frame) { valid_pred_modes |= 1 << PRED_SKIP; valid_pred_modes |= 1 << PRED_2Nx2N; valid_pred_modes |= h->param->inter_2pu << PRED_2NxN; valid_pred_modes |= h->param->inter_2pu << PRED_Nx2N; if (h->param->enable_amp && level > MIN_CU_SIZE_IN_BIT) { valid_pred_modes |= 1 << PRED_2NxnU; valid_pred_modes |= 1 << PRED_2NxnD; valid_pred_modes |= 1 << PRED_nLx2N; valid_pred_modes |= 1 << PRED_nRx2N; } } /* set validity of intra modes */ if (!inter_frame || h->param->enable_intra) { valid_pred_modes |= 1 << PRED_I_2Nx2N; valid_pred_modes |= (level == MIN_CU_SIZE_IN_BIT) << PRED_I_NxN; // only valid for 32x8,8x32, 16x4,4x16 if (h->param->enable_sdip && (level == B16X16_IN_BIT || level == B32X32_IN_BIT)) { valid_pred_modes |= 1 << PRED_I_2Nxn; valid_pred_modes |= 1 << PRED_I_nx2N; } } // @luofl: SDIP is disabled here for speedup in inter frames if (inter_frame && level != MIN_CU_SIZE_IN_BIT) { valid_pred_modes &= ~((1 << PRED_I_2Nxn) | (1 << PRED_I_nx2N) | (1 << PRED_I_NxN)); } if (inter_frame && IS_ALG_ENABLE(OPT_PU_RMS)) { if (level == B8X8_IN_BIT || level == B16X16_IN_BIT) { valid_pred_modes &= (uint32_t)((1 << PRED_2Nx2N) | (1 << PRED_I_2Nx2N)); } } h->valid_modes[frm_type][level - MIN_CU_SIZE_IN_BIT] = valid_pred_modes; } } } /* --------------------------------------------------------------------------- */ rdcost_t compress_ctu_intra(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit) { aec_t cs_aec; cu_layer_t *p_layer = cu_get_layer(h, i_level); cu_info_t *best = &p_layer->cu_best; rdcost_t large_cu_cost = MAX_COST; rdcost_t split_cu_cost = MAX_COST; int b_inside_pic = (p_cu->i_pix_x + p_cu->i_size <= h->i_width) && (p_cu->i_pix_y + p_cu->i_size <= h->i_height); int b_split_ctu = (i_level > i_min_level || !b_inside_pic); int b_check_large_cu = (b_inside_pic && i_level <= i_max_level); /* init current CU --------------------------------------------- */ cu_init(h, p_cu, best, i_level); /* coding current CU ------------------------------------------- */ if (b_check_large_cu) { if (IS_ALG_ENABLE(OPT_ET_INTRA_DEPTH) && b_split_ctu) { b_split_ctu &= ctu_intra_depth_pred_mad(h, i_level, p_cu->i_pos_x, p_cu->i_pos_y); } h->copy_aec_state_rdo(&cs_aec, p_aec); large_cu_cost = 
compress_cu_intra(h, &cs_aec, p_cu, best, cost_limit); /* QSFD, skip smaller CU partitions */ if (IS_ALG_ENABLE(OPT_CU_QSFD)) { if (p_cu->cu_info.i_level > 3 && large_cu_cost < h->thres_qsfd_cu[1][p_cu->cu_info.i_level - 3]) { b_split_ctu = FALSE; } } } /* coding 4 sub-CUs -------------------------------------------- */ if (b_split_ctu) { int i; split_cu_cost = 0; // cal split cost if (b_inside_pic) { split_cu_cost += h->f_lambda_mode * p_aec->binary.write_ctu_split_flag(p_aec, 1, i_level); } for (i = 0; i < 4; i++) { cu_t *p_sub_cu = p_cu->sub_cu[i]; if (p_sub_cu->i_pix_x >= h->i_width || p_sub_cu->i_pix_y >= h->i_height) { continue; // current sub CU is outside the frame+* } split_cu_cost += compress_ctu_intra(h, p_aec, p_sub_cu, i_level - 1, i_min_level, i_max_level, large_cu_cost - split_cu_cost); if (split_cu_cost > large_cu_cost || (i != 3 && IS_ALG_ENABLE(OPT_CODE_OPTIMZATION) && (split_cu_cost >= SUBCU_COST_RATE[0][i] * large_cu_cost))) { split_cu_cost = MAX_COST; // guide RDO to select large CU break; } } } /* decide split or not ----------------------------------------- */ if (large_cu_cost < split_cu_cost) { /* the larger cu is selected */ cu_copy_stored_parameters(h, p_cu, best); h->copy_aec_state_rdo(p_aec, &p_layer->cs_cu); split_cu_cost = large_cu_cost; } return split_cu_cost; } /* --------------------------------------------------------------------------- */ rdcost_t compress_ctu_inter(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit) { aec_t cs_aec; cu_layer_t *p_layer = cu_get_layer(h, i_level); cu_info_t *best = &p_layer->cu_best; rdcost_t large_cu_cost = MAX_COST; rdcost_t split_cu_cost = MAX_COST; rdcost_t split_flag_cost = 0; uint32_t avail_modes = cu_get_valid_modes(h, h->i_type, i_level); int b_inside_pic = (p_cu->i_pix_x + p_cu->i_size <= h->i_width) && (p_cu->i_pix_y + p_cu->i_size <= h->i_height); int b_split_ctu = (i_level > i_min_level || !b_inside_pic); int b_check_large_cu = (b_inside_pic && i_level <= i_max_level); /* init current CU --------------------------------------------- */ cu_init(h, p_cu, best, i_level); /* coding current CU ------------------------------------------- */ if (b_check_large_cu) { h->copy_aec_state_rdo(&cs_aec, p_aec); if (i_level > MIN_CU_SIZE_IN_BIT) { split_flag_cost = h->f_lambda_mode * p_aec->binary.write_ctu_split_flag(&cs_aec, 0, i_level); } large_cu_cost = compress_cu_inter(h, &cs_aec, p_cu, best, avail_modes, large_cu_cost, cost_limit); large_cu_cost += split_flag_cost; if (IS_ALG_ENABLE(OPT_ET_HOMO_MV) && i_level > i_min_level) { b_split_ctu &= !is_ET_inter_recur(h, p_cu, best); } /* ǰCUһCUģʽΪSKIPģʽ²CUĻ @ */ if (IS_ALG_ENABLE(OPT_CU_CSET) && ((p_cu->i_size <= 16 && h->i_type == SLICE_TYPE_B) || (p_cu->i_size <= 32 && h->fdec->rps.referd_by_others == 0))) { cu_layer_t *p_ulayer = cu_get_layer(h, i_level + 1); cu_info_t *curr_ubest = &p_ulayer->cu_best; if (IS_SKIP_MODE(curr_ubest->i_mode) && IS_SKIP_MODE(best->i_mode)) { b_split_ctu = 0; } } /* QSFD, skip smaller CU partitions */ if (IS_ALG_ENABLE(OPT_CU_QSFD)) { if (p_cu->cu_info.i_level != 3 && large_cu_cost < h->thres_qsfd_cu[0][p_cu->cu_info.i_level - 3]) { b_split_ctu = FALSE; } } if (IS_ALG_ENABLE(OPT_ECU) && i_level > i_min_level) { // int i_level_left = p_cu->p_left_cu ? p_cu->p_left_cu->i_level : MAX_CU_SIZE_IN_BIT; // int i_level_top = p_cu->p_topA_cu ? 
p_cu->p_topA_cu->i_level : MAX_CU_SIZE_IN_BIT; // b_split_ctu &= !(i_level_left >= i_level && i_level_top >= i_level && (best->i_mode == PRED_SKIP)); b_split_ctu &= !((best->i_mode == PRED_SKIP) && (best->i_cbp == 0) && p_cu->is_zero_block); } } /* coding 4 sub-CUs -------------------------------------------- */ if (b_split_ctu) { int i; split_cu_cost = 0; // cal split cost if (b_inside_pic) { split_cu_cost += h->f_lambda_mode * p_aec->binary.write_ctu_split_flag(p_aec, 1, i_level); } for (i = 0; i < 4; i++) { cu_t *p_sub_cu = p_cu->sub_cu[i]; if (p_sub_cu->i_pix_x >= h->i_width || p_sub_cu->i_pix_y >= h->i_height) { continue; // current sub CU is outside the frame } split_cu_cost += compress_ctu_inter(h, p_aec, p_sub_cu, i_level - 1, i_min_level, i_max_level, large_cu_cost - split_cu_cost); if (split_cu_cost > large_cu_cost || (i != 3 && IS_ALG_ENABLE(OPT_CODE_OPTIMZATION) && (split_cu_cost >= SUBCU_COST_RATE[1][i] * large_cu_cost))) { split_cu_cost = MAX_COST; // guide RDO to select large CU break; } } } if (IS_ALG_ENABLE(OPT_SUBCU_SPLIT)) { if ((p_cu->sub_cu[0] != NULL) && (p_cu->sub_cu[1] != NULL) && (p_cu->sub_cu[2] != NULL) && (p_cu->sub_cu[3] != NULL)) { if (((p_cu->sub_cu[0]->is_ctu_split + p_cu->sub_cu[1]->is_ctu_split + p_cu->sub_cu[2]->is_ctu_split + p_cu->sub_cu[3]->is_ctu_split) >= 3)){ b_check_large_cu = FALSE; // 1080p 20% ʡԼ1.7%ʧpreset 61080p } /* else if (((!p_cu->sub_cu[0]->is_ctu_split) && ((p_cu->sub_cu[0]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[0]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[0]->cu_info.i_cbp == 0))) && ((!p_cu->sub_cu[1]->is_ctu_split) && ((p_cu->sub_cu[1]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[1]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[1]->cu_info.i_cbp == 0))) && ((!p_cu->sub_cu[2]->is_ctu_split) && ((p_cu->sub_cu[2]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[2]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[2]->cu_info.i_cbp == 0))) && ((!p_cu->sub_cu[3]->is_ctu_split) && ((p_cu->sub_cu[3]->cu_info.i_mode == PRED_SKIP || p_cu->sub_cu[3]->cu_info.i_mode == PRED_2Nx2N) && (p_cu->sub_cu[3]->cu_info.i_cbp == 0)))) { avail_modes &= (1<copy_aec_state_rdo(p_aec, &p_layer->cs_cu); split_cu_cost = large_cu_cost; p_cu->is_ctu_split = FALSE; } else { p_cu->is_ctu_split = TRUE; } p_cu->rdcost = split_cu_cost; return split_cu_cost; } /* --------------------------------------------------------------------------- */ void xavs2_rdo_init(uint32_t cpuid, intrinsic_func_t *pf) { UNUSED_PARAMETER(cpuid); pf->compress_ctu[SLICE_TYPE_I] = compress_ctu_intra; pf->compress_ctu[SLICE_TYPE_P] = compress_ctu_inter; pf->compress_ctu[SLICE_TYPE_F] = compress_ctu_inter; pf->compress_ctu[SLICE_TYPE_B] = compress_ctu_inter; pf->get_skip_mv_predictors[SLICE_TYPE_I] = NULL; pf->get_skip_mv_predictors[SLICE_TYPE_P] = get_mv_predictors_pskip; pf->get_skip_mv_predictors[SLICE_TYPE_F] = get_mv_predictors_pskip; pf->get_skip_mv_predictors[SLICE_TYPE_B] = get_mv_predictors_bskip; } xavs2-1.3/source/encoder/rdo.h000066400000000000000000000045501340660520300162770ustar00rootroot00000000000000/* * rdo.h * * Description of this file: * RDO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_RDO_H #define XAVS2_RDO_H #define xavs2_init_valid_mode_table FPFX(init_valid_mode_table) void xavs2_init_valid_mode_table(xavs2_t *h); #define compress_ctu_inter FPFX(compress_ctu_inter) rdcost_t compress_ctu_inter (xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit); #define compress_ctu_intra_topdown FPFX(compress_ctu_intra_topdown) rdcost_t compress_ctu_intra_topdown(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit); #define compress_ctu_intra_downtop FPFX(compress_ctu_intra_downtop) rdcost_t compress_ctu_intra_downtop(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit); typedef rdcost_t(*lcu_analyse_t)(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, int i_level, int i_min_level, int i_max_level, rdcost_t cost_limit); #endif // XAVS2_RDO_H xavs2-1.3/source/encoder/rdoq.c000066400000000000000000001366111340660520300164570ustar00rootroot00000000000000/* * rdoq.c * * Description of this file: * RDOQ functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "transform.h" #include "rdoq.h" #include "wquant.h" #include "aec.h" #include "cudata.h" /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ enum node_type_e { LAST_POS, LAST_RUN, RUN_LEVEL_PAIR }; /* --------------------------------------------------------------------------- * pair cost for RDOQ */ typedef struct pair_cost_t { rdcost_t levelCost; rdcost_t runCost; rdcost_t uncodedCost; int16_t posBlockX; int16_t posBlockY; int32_t scanPos; } pair_cost_t; /* --------------------------------------------------------------------------- * cost state for RDOQ */ typedef struct cost_state_t { pair_cost_t pairCost[16 + 1]; // OLD: +1 rdcost_t sigCGFlagCost; rdcost_t sigCGFlagCost0; rdcost_t lastRunCost; int pairNum; } cost_state_t; /* --------------------------------------------------------------------------- * level info for RDOQ */ typedef struct level_info_t { int pos_scan; /* position in transform block zig-zag scan order */ int pos_xy; /* position in block */ int num_level; /* number of candidate levels */ coeff_t level[3]; /* candidate levels */ coeff_t coeff; /* coefficient before quantization */ double errLevel[3]; /* quantization errors of each candidate */ } level_info_t; /* --------------------------------------------------------------------------- */ typedef struct node_t node_t; struct node_t { node_t *prev; node_t *next; level_info_t *level_info; int attrib; // node_type_e, 0: last pos; 1: last run; 2: (Run, Level) pair int level; int run; int pos; // scan position in CG CoeffPosInCG = ScanCoeffInCG[LastCoeffPosX][LastCoefPosY] rdcost_t cost; }; /* --------------------------------------------------------------------------- */ typedef struct node_list_t { ALIGN16(node_t nodeBuf[16 + 4]); node_t *head; node_t *tail; int i_size; /* number of nodes in the list */ } node_list_t; /** * =========================================================================== * local & global variable defines * =========================================================================== */ /* --------------------------------------------------------------------------- * AC ENGINE PARAMETERS */ static const int16_t tab_LPSbits[256] = { 2184,2184,1928,1779,1673,1591,1525,1468,1419,1376,1338,1303,1272,1243,1216,1191, 1167,1145,1125,1105,1087,1069,1053,1037,1022,1007, 993, 980, 967, 954, 942, 930, 919, 908, 898, 888, 878, 868, 859, 850, 841, 832, 824, 816, 808, 800, 792, 785, 777, 770, 763, 756, 750, 743, 737, 730, 724, 718, 712, 707, 701, 695, 690, 684, 679, 674, 669, 663, 658, 654, 649, 644, 639, 635, 630, 626, 621, 617, 613, 608, 604, 600, 596, 592, 588, 584, 580, 577, 573, 569, 566, 562, 558, 555, 551, 548, 545, 541, 538, 535, 531, 528, 525, 522, 519, 516, 513, 510, 507, 504, 501, 498, 495, 492, 490, 487, 484, 482, 479, 476, 474, 471, 468, 466, 463, 461, 458, 456, 454, 451, 449, 446, 444, 442, 439, 437, 435, 433, 430, 428, 426, 424, 422, 420, 418, 415, 413, 411, 409, 407, 405, 403, 401, 399, 397, 395, 394, 392, 390, 388, 386, 384, 382, 381, 379, 377, 375, 373, 372, 370, 368, 367, 365, 363, 362, 360, 358, 357, 355, 353, 352, 350, 349, 347, 346, 344, 342, 341, 339, 338, 336, 335, 333, 332, 331, 329, 328, 326, 325, 323, 322, 321, 319, 318, 317, 315, 314, 313, 311, 310, 309, 307, 306, 305, 303, 302, 301, 300, 298, 297, 296, 295, 293, 292, 291, 290, 289, 287, 286, 285, 284, 
283, 282, 281, 279, 278, 277, 276, 275, 274, 273, 272, 271, 269, 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257 }; extern const int tab_intra_mode_scan_type[NUM_INTRA_MODE]; static const int8_t tab_rank[6] = {0, 1, 2, 3, 3, 4/*, 4 ...*/}; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void list_init(node_list_t *list) { list->head = NULL; list->tail = NULL; list->i_size = 0; } /* --------------------------------------------------------------------------- * create a new node and append it to list */ static ALWAYS_INLINE node_t * create_and_append_node(node_list_t *list, level_info_t *level_info, int attrib, int pos) { node_t *p_node = list->nodeBuf + list->i_size; /* 1, create a new node */ p_node->attrib = attrib; p_node->level_info = level_info; p_node->run = 0; p_node->pos = pos; p_node->prev = NULL; p_node->next = NULL; list->i_size++; /* 2, append the new node to list */ if (list->head == NULL) { /* for empty list */ list->head = p_node; } else { /* append tail */ list->tail->next = p_node; p_node->prev = list->tail; } list->tail = p_node; return p_node; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void remove_node(node_t *p_node) { p_node->prev->next = p_node->next; if (p_node->next != NULL) { p_node->next->prev = p_node->prev; } } /* --------------------------------------------------------------------------- * actually arithmetic encoding of one binary symbol by using the probability * estimate of its associated context model */ static ALWAYS_INLINE int biari_encode_symbol_est(uint8_t symbol, context_t *ctx) { const int lg_pmps = ctx->LG_PMPS >> 2; return (symbol == ctx->MPS) ? lg_pmps : tab_LPSbits[lg_pmps]; } /* --------------------------------------------------------------------------- * actually arithmetic encoding of one binary symbol by using the probability * estimate of its associated context model * firstly encode 0 and then encode 1 */ static ALWAYS_INLINE int biari_encode_symbol_est_0_then_1(context_t *ctx) { const int lg_pmps = ctx->LG_PMPS >> 2; return lg_pmps + tab_LPSbits[lg_pmps]; } /* --------------------------------------------------------------------------- * lensymbolֵɱ뵥ſ֪IJ£򻯳ɵŵ */ static ALWAYS_INLINE int biari_encode_symbols_est(uint8_t symbol, int len, context_t *ctx) { return len * biari_encode_symbol_est(symbol, ctx); } /* --------------------------------------------------------------------------- */ #define biari_encode_symbol_eq_prob_est(p) (256) /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int biari_encode_symbol_final_est(uint8_t symbol) { // context_t ctx = { 1 << LG_PMPS_SHIFTNO, 0, 0 }; // return biari_encode_symbol_est(symbol, &ctx); /* symbol != MPS ? tab_LPSbits[lg_pmps] : lg_pmps */ return symbol ? 
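/*
 * The helpers above estimate the price of a context-coded bin: a bin equal to the MPS
 * costs the scaled LG_PMPS estimate, a LPS costs the larger tab_LPSbits[] entry, and an
 * equal-probability bin is charged a constant 256, which suggests a 1/256-bit fixed
 * point. A toy accumulator in the same spirit follows; toy_ctx_t and its fields are
 * illustrative, and the fixed-point scale is an assumption rather than a documented fact.
 */
#if 0   /* simplified stand-in for the biari_encode_symbol_est() style estimators */
#include <stdint.h>

typedef struct {
    uint8_t  mps;        /* most probable symbol, 0 or 1 */
    uint16_t cost_mps;   /* estimated cost of coding the MPS (256 == 1 bit assumed) */
    uint16_t cost_lps;   /* estimated cost of coding the LPS, always >= cost_mps */
} toy_ctx_t;

static int estimate_bins(const toy_ctx_t *ctx, const uint8_t *bins, int num_bins)
{
    int bits = 0;   /* accumulated cost in the same fixed-point units */
    int i;
    for (i = 0; i < num_bins; i++) {
        bits += (bins[i] == ctx->mps) ? ctx->cost_mps : ctx->cost_lps;
    }
    return bits;
}
#endif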
tab_LPSbits[1] : 1; } /* --------------------------------------------------------------------------- * set all coefficients to zeros */ static ALWAYS_INLINE void rdoq_memset_zero_coeffs(coeff_t *ncur_blk, int pos_start, int pos_end) { memset(ncur_blk + pos_start, 0, (pos_end - pos_start) * sizeof(coeff_t)); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int get_block_size_id_wq(int i_tu_split, int i_tu_level, int b_luma) { int wqm_size_id = 0; if (b_luma && i_tu_split == TU_SPLIT_HOR) { if (i_tu_level == B8X8_IN_BIT) { wqm_size_id = 2; } else if (i_tu_level == B16X16_IN_BIT || i_tu_level == B32X32_IN_BIT) { wqm_size_id = 3; } } else if (b_luma && i_tu_split == TU_SPLIT_VER) { if (i_tu_level == B8X8_IN_BIT) { wqm_size_id = 2; } else if (i_tu_level == B16X16_IN_BIT || i_tu_level == B32X32_IN_BIT) { wqm_size_id = 3; } } else { wqm_size_id = i_tu_level - B4X4_IN_BIT; } return wqm_size_id; } /* --------------------------------------------------------------------------- */ static int est_rate_last_cg_pos(rdoq_t *p_rdoq, int iCG, int *cg_x, int *cg_y) { int rate = 0; if (p_rdoq->i_tu_level == B4X4_IN_BIT) { *cg_x = 0; *cg_y = 0; } else { context_t *p_ctx = p_rdoq->p_ctx_last_cg; int i_cg_last_x = p_rdoq->p_scan_cg[iCG][0]; int i_cg_last_y = p_rdoq->p_scan_cg[iCG][1]; *cg_x = i_cg_last_x; *cg_y = i_cg_last_y; if (p_rdoq->i_tu_level == B8X8_IN_BIT) { switch (iCG) { case 0: rate += biari_encode_symbol_est(1, p_ctx + 0); break; case 1: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(1, p_ctx + 1); break; case 2: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(0, p_ctx + 1); rate += biari_encode_symbol_est(1, p_ctx + 2); break; default: // case 3: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(0, p_ctx + 1); rate += biari_encode_symbol_est(0, p_ctx + 2); break; } } else { const int b_luma = p_rdoq->b_luma; int num_cg_x = p_rdoq->num_cg_x - 1; // (number - 1) of CG in x direction int num_cg_y = p_rdoq->num_cg_y - 1; // (number - 1) of CG in y direction if (b_luma && p_rdoq->b_dc_diag) { XAVS2_SWAP(i_cg_last_x, i_cg_last_y); } if (i_cg_last_x == 0 && i_cg_last_y == 0) { rate += biari_encode_symbol_est(0, p_ctx + 3); /* last_cg0_flag */ } else { rate += biari_encode_symbol_est(1, p_ctx + 3); /* last_cg0_flag */ /* last_cg_x */ rate += biari_encode_symbols_est(0, i_cg_last_x, p_ctx + 4); if (i_cg_last_x < num_cg_x) { rate += biari_encode_symbol_est(1, p_ctx + 4); } /* last_cg_y */ rate += biari_encode_symbols_est(0, i_cg_last_y - (i_cg_last_x == 0), p_ctx + 5); if (i_cg_last_y < num_cg_y) { rate += biari_encode_symbol_est(1, p_ctx + 5); } } } } return rate; } /* --------------------------------------------------------------------------- * estimate rate of coding "significant_cg_flag" */ static ALWAYS_INLINE int est_rate_nonzero_cg_flag(rdoq_t *p_rdoq, int sig_cg_flag, int ctx) { return biari_encode_symbol_est((uint8_t)sig_cg_flag, p_rdoq->p_ctx_sign_cg + ctx); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int est_rate_last_coeff_pos(rdoq_t *p_rdoq, int last_coeff_pos_x, int last_coeff_pos_y, int isLastCG, int cg_x, int cg_y) { context_t *p_ctx = p_rdoq->p_ctx_last_pos; const int b_dc_diag = p_rdoq->b_dc_diag; const int b_one_cg = (p_rdoq->i_tu_level == B4X4_IN_BIT); const int b_luma = p_rdoq->b_luma; int rate = 0; int offset = 0; if (!isLastCG) { last_coeff_pos_x = 3 - last_coeff_pos_x; if 
(b_dc_diag) { last_coeff_pos_y = 3 - last_coeff_pos_y; } } if (cg_x == 0 && cg_y > 0 && b_dc_diag) { XAVS2_SWAP(last_coeff_pos_x, last_coeff_pos_y); } /* AVS2-P2: 8.3.3.2.14 ȷlast_coeff_pos_x last_coeff_pos_y ctxIdxInc */ if (b_luma == 0) { // ɫȷռ12 offset = b_one_cg ? 0 : 4 + isLastCG * 4; } else if (b_one_cg) { // Log2TransformSize Ϊ 2ռ8 offset = 40 + (b_dc_diag) * 4; } else if (cg_x != 0 && cg_y != 0) { // cg_x cg_y Ϊ㣬ռ8 offset = 32 + isLastCG * 4; } else { // λռ40 offset = (4 * isLastCG + 2 * (cg_x == 0 && cg_y == 0) + (b_dc_diag)) * 4; } p_ctx += offset; switch (last_coeff_pos_x) { case 0: rate += biari_encode_symbol_est(1, p_ctx + 0); break; case 1: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(1, p_ctx + 1); break; case 2: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est_0_then_1(p_ctx + 1); break; default: // case 3: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(0, p_ctx + 1) << 1; break; } p_ctx += 2; switch (last_coeff_pos_y) { case 0: rate += biari_encode_symbol_est(1, p_ctx + 0); break; case 1: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(1, p_ctx + 1); break; case 2: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est_0_then_1(p_ctx + 1); break; default: // case 3: rate += biari_encode_symbol_est(0, p_ctx + 0); rate += biari_encode_symbol_est(0, p_ctx + 1) << 1; break; } return rate; } /* --------------------------------------------------------------------------- */ static int est_rate_level(context_t *p_ctx, int rank, int absLevel, int pairsInCG, int iCG, int pos, int b_luma) { int rate = 0; int symbol = absLevel - 1; if (symbol > 31) { int exp_golomb_order = 0; symbol -= 32; rate += biari_encode_symbol_final_est(1); while (symbol >= (1 << exp_golomb_order)) { symbol -= (1 << exp_golomb_order); exp_golomb_order++; } rate += (2 * exp_golomb_order + 1) * biari_encode_symbol_eq_prob_est(symbol); } else { int pairsInCGIdx = XAVS2_MIN(2, ((pairsInCG + 1) >> 1)); p_ctx += 10 * (iCG == 0 && pos > 12) + XAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1); // chroma if (!b_luma) { p_ctx += 20; } rate += biari_encode_symbol_final_est(0); rate += biari_encode_symbols_est(0, symbol, p_ctx); if (symbol < 31) { rate += biari_encode_symbol_est(1, p_ctx); } } return rate; } /* --------------------------------------------------------------------------- */ static int est_rate_run(rdoq_t *p_rdoq, context_t *p_ctx, int run, int pos, int iCG, int remaining_pos) { static const int8_t tab_run_rate_ctx_offset[16] = { 2, 2, 2, 2, 2, // pos <= 4 1, 1, 1, 1, 1, 1, 1, // pos > 4 && pos <= 11 0, 0, 0, 0 // pos > 11 }; const int b_luma = p_rdoq->b_luma; const int b_dc_diag = p_rdoq->b_dc_diag; int symbol = run; int offset = 0; int rate = 0; int pos2 = 15 - pos - 1; int off2 = (p_rdoq->i_tu_level == B4X4_IN_BIT) ? 0 : (b_luma ? 4 : 3); int moddiv; int y_div2; if (pos < 15) { if (b_luma) { y_div2 = (tab_scan_4x4[pos2][1] + 1) >> 1; moddiv = b_dc_diag ? tab_run_rate_ctx_offset[pos] : y_div2; offset = ((iCG == 0) ? (pos == 14 ? 0 : (1 + moddiv)) : (4 + moddiv)) + off2; // 0,...,9 10 } else { moddiv = (pos <= 9); offset = ((iCG == 0) ? (pos == 14 ? 0 : (1 + moddiv)) : 2) + off2; // 0,1,2, +4 (8) } } if (iCG == 0) { if (b_luma) { while (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); rate += biari_encode_symbol_est(0, p_ctx + offset); pos++; if (--pos2 >= 0) { y_div2 = (tab_scan_4x4[pos2][1] + 1) >> 1; moddiv = b_dc_diag ? 
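/*
 * When absLevel - 1 exceeds 31, est_rate_level() above charges an escape code: the 32
 * values covered by the context-coded prefix are removed, powers of two are stripped
 * from the remainder until it fits, and the value then costs 2*order + 1
 * equal-probability bins. The standalone sketch below reproduces only that bin count;
 * escape_bin_count() is an illustrative helper, not an encoder function.
 */
#if 0   /* sketch of the escape-code bin count used for large levels */
static int escape_bin_count(int v)     /* v = absLevel - 1 - 32, as in est_rate_level() */
{
    int order = 0;
    while (v >= (1 << order)) {
        v -= 1 << order;
        order++;
    }
    return 2 * order + 1;              /* "order" prefix bins + 1 stop bin + "order" suffix bins */
}
/* e.g. v = 0 -> 1 bin, v = 1..2 -> 3 bins, v = 3..6 -> 5 bins, v = 7..14 -> 7 bins */
#endif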
tab_run_rate_ctx_offset[pos] : y_div2; offset = off2 + (pos == 14 ? 0 : (1 + moddiv)); } } } else { while (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); rate += biari_encode_symbol_est(0, p_ctx + offset); pos++; if (--pos2 >= 0) { moddiv = (pos <= 9); offset = off2 + (pos == 14 ? 0 : (1 + moddiv)); } } } } else { if (b_luma) { while (symbol-- > 0) { assert(offset >= 0 && offset < NUM_MAP_CTX); rate += biari_encode_symbol_est(0, p_ctx + offset); pos++; if (--pos2 >= 0) { y_div2 = (tab_scan_4x4[pos2][1] + 1) >> 1; moddiv = b_dc_diag ? tab_run_rate_ctx_offset[pos] : y_div2; offset = off2 + 4 + moddiv; } } } else { offset = off2 + 2; assert(offset >= 0 && offset < NUM_MAP_CTX); rate += symbol * biari_encode_symbol_est(0, p_ctx + offset); } } if (run < remaining_pos) { assert(offset >= 0 && offset < NUM_MAP_CTX); rate += biari_encode_symbol_est(1, p_ctx + offset); } return rate; } /* --------------------------------------------------------------------------- * get number of non-zero CGs */ static ALWAYS_INLINE int rdoq_get_last_cg_pos(coeff_t *ncur_blk, int num_coeff, const int thres_lower_int) { int idx_coeff; for (idx_coeff = num_coeff - 1; idx_coeff >= 0; idx_coeff--) { if (ncur_blk[idx_coeff] <= thres_lower_int) { ncur_blk[idx_coeff] = 0; } else { break; /* last none zero coeff is found */ } } return ((idx_coeff + 16) >> 4); } #if ENABLE_WQUANT /* --------------------------------------------------------------------------- * rdoqԤȼһϵȡֵȨ */ static int rdoq_est_coeff_level_wq(xavs2_t *h, level_info_t *p_level_info, wq_data_t *wq, int wqm_size_id, int wqm_size, int xx, int yy, coeff_t coeff, int qp, int shift_bit) { const double f_err_level_mult = 256.0 / (1 << (shift_bit * 2)); const int thres_lower_int = (int)((16384 << shift_bit) / (double)(tab_Q_TAB[qp])); const int scale = tab_IQ_TAB[qp]; const int shift = tab_IQ_SHIFT[qp] - shift_bit; int wqm_shift = (h->param->PicWQDataIndex == 1) ? 
3 : 0; int wqm_coef = 1; int rec, err; int level; int b_lower; int stride; if ((wqm_size_id == 0) || (wqm_size_id == 1)) { stride = wqm_size; wqm_coef = wq->cur_wq_matrix[wqm_size_id][(yy & (stride - 1)) * stride + (xx & (stride - 1))]; } else if (wqm_size_id == 2) { stride = wqm_size >> 1; wqm_coef = wq->cur_wq_matrix[wqm_size_id][((yy >> 1) & (stride - 1)) * stride + ((xx >> 1) & (stride - 1))]; } else if (wqm_size_id == 3) { stride = wqm_size >> 2; wqm_coef = wq->cur_wq_matrix[wqm_size_id][((yy >> 2) & (stride - 1)) * stride + ((xx >> 2) & (stride - 1))]; } level = (int)(coeff * tab_Q_TAB[qp] >> (15 + shift_bit)); level = XAVS2_CLIP3((-((1 << 18) / wqm_coef)), (((1 << 18) / wqm_coef) - 1), level); rec = (((((coeff * wqm_coef) >> 3) * scale) >> 4) + (1 << (shift - 1))) >> shift; b_lower = (coeff - rec) <= thres_lower_int; #define GET_ERROR_LEVEL_WQ(i, cur_level) \ {\ rec = ((((((int)(cur_level) * wqm_coef) >> wqm_shift) * scale) >> 4) + (1 << (shift - 1))) >> shift;\ err = coeff - rec;\ p_level_info->errLevel[i] = err * err * f_err_level_mult;\ p_level_info->level[i] = (coeff_t)(cur_level);\ } /* 1, generate levels of one coefficient [xx, yy] */ p_level_info->coeff = coeff; GET_ERROR_LEVEL_WQ(0, 0); if (level == 0) { if (b_lower) { p_level_info->num_level = 1; return 0; } else { GET_ERROR_LEVEL_WQ(1, 1); p_level_info->num_level = 2; return 1; } } else { if (b_lower) { GET_ERROR_LEVEL_WQ(1, level); p_level_info->num_level = 2; } else { GET_ERROR_LEVEL_WQ(1, level); GET_ERROR_LEVEL_WQ(2, level + 1); p_level_info->num_level = 3; } return level; } #undef GET_ERROR_LEVEL_WQ } #endif /* --------------------------------------------------------------------------- * rdoqԤȼһϵȡֵ */ static ALWAYS_INLINE int rdoq_est_coeff_level(level_info_t *p_level_info, coeff_t coeff, int qp, int shift_bit, const double f_err_level_mult, const int thres_lower_int) { const int scale = tab_IQ_TAB[qp]; const int shift = tab_IQ_SHIFT[qp] - shift_bit; const int shift_offset = 1 << (shift - 1); int rec, err; int level; int b_lower; #define GET_ERROR_LEVEL(i, cur_level) \ {\ rec = ((cur_level) * scale + shift_offset) >> shift;\ err = coeff - rec;\ p_level_info->errLevel[i] = err * err * f_err_level_mult;\ p_level_info->level[i] = (coeff_t)(cur_level);\ } level = (int)(coeff * tab_Q_TAB[qp] >> (15 + shift_bit)); rec = (level * scale + shift_offset) >> shift; b_lower = (coeff - rec) <= thres_lower_int; p_level_info->coeff = coeff; GET_ERROR_LEVEL(0, 0); if (level == 0) { if (b_lower) { p_level_info->num_level = 1; return 0; } else { GET_ERROR_LEVEL(1, 1); p_level_info->num_level = 2; return 1; } } else { if (b_lower) { GET_ERROR_LEVEL(1, level); p_level_info->num_level = 2; } else { GET_ERROR_LEVEL(1, level); GET_ERROR_LEVEL(2, level + 1); p_level_info->num_level = 3; } return level; } #undef GET_ERROR_LEVEL } /* --------------------------------------------------------------------------- * ָڵϵۼӺ */ static ALWAYS_INLINE int rdoq_get_sum_abs_coeff(level_info_t *p_level_info, const coeff_t *p_ncoeff, int pos_start, int pos_end) { int sum = 0; int pos; for (pos = pos_start; pos <= pos_end; pos++) { sum += p_ncoeff[p_level_info[pos].pos_scan]; } return sum; } /* --------------------------------------------------------------------------- * һCGڵϵĸlevelǰCG */ static int rdoq_est_cg(xavs2_t *h, rdoq_t *p_rdoq, level_info_t *p_level_info, cost_state_t *p_cost_stat, node_t *node, coeff_t *ncur_blk, int8_t *p_sig_cg_flag, int iCG, int rank_pre) { static const int T_Chr[5] = {0, 1, 2, 4, INT_MAX}; pair_cost_t *p_pair_cost = 
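/*
 * rdoq_est_coeff_level() above generates, for each coefficient, a small set of candidate
 * levels (always 0, plus the hard-quantized level and possibly level + 1) together with
 * their squared reconstruction errors, which later become the distortion term of the RD
 * decision. The sketch below shows the idea with plain scalar parameters; q_scale,
 * dq_scale and dq_shift are stand-ins for tab_Q_TAB / tab_IQ_TAB / tab_IQ_SHIFT, and the
 * candidate rules are deliberately simplified.
 */
#if 0   /* simplified level-candidate generation (dq_shift >= 1 assumed) */
typedef struct {
    int    level[3];
    double err[3];
    int    num;
} toy_candidates_t;

static void make_candidates(int coeff, int q_scale, int dq_scale, int dq_shift,
                            toy_candidates_t *out)
{
    int level = (coeff * q_scale) >> 15;             /* hard quantization */
    int cand[3];
    int n = 0, i;

    cand[n++] = 0;                                   /* RDOQ may always zero a coefficient */
    if (level > 0) {
        cand[n++] = level;
        cand[n++] = level + 1;                       /* the rounding-up alternative */
    } else {
        cand[n++] = 1;                               /* smallest non-zero alternative */
    }
    for (i = 0; i < n; i++) {
        int rec = (cand[i] * dq_scale + (1 << (dq_shift - 1))) >> dq_shift;
        double e = (double)(coeff - rec);
        out->level[i] = cand[i];
        out->err[i]   = e * e;                       /* distortion term for the RD decision */
    }
    out->num = n;
}
#endif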
&p_cost_stat->pairCost[0]; context_t(*p_ctx_primary)[NUM_MAP_CTX] = p_rdoq->p_ctx_primary; context_t *p_ctx; const rdcost_t lambda_rdoq = h->f_lambda_rdoq; rdcost_t lagrUncoded = 0; rdcost_t lagrAcc = 0; int pairsInCG = 0; int isSigCG = 0; int isLastCG = 0; int level_max = T_Chr[rank_pre]; int rank = rank_pre; int CGx = p_rdoq->p_scan_cg[iCG][0]; int CGy = p_rdoq->p_scan_cg[iCG][1]; int w_shift_x = p_rdoq->bit_size_shift_x; int w_shift_y = p_rdoq->bit_size_shift_y; // rdoq for this CG if (node != NULL && node->attrib == LAST_POS) { isLastCG = 1; node = node->next; } while (node != NULL) { if (node->attrib == LAST_RUN) { // this is not the last CG if (node->run != 16) { int scan_pos = tab_1d_scan_4x4[15 - node->run]; p_cost_stat->lastRunCost = lambda_rdoq * est_rate_last_coeff_pos(p_rdoq, scan_pos & 3, (scan_pos >> 2), 0, CGx, CGy); } else { p_cost_stat->lastRunCost = 0; } lagrAcc += p_cost_stat->lastRunCost; } else { // a (level, run) pair // try level, level-1 first, then compare to level=0 case int levelNo; int absSum5; int absLevel; int rateRunMerged = 0; int best_state = 0; rdcost_t minlagr = MAX_COST; rdcost_t lagrDelta = 0; rdcost_t lagrDelta0 = 0; rdcost_t lagr; int xx_yy; isSigCG = 1; for (levelNo = 1; levelNo < node->level_info->num_level; levelNo++) { int rateLevel; int rateRunCurr; int rateRunPrev; int pos_end = XAVS2_MIN(node->pos + 6, 15); // rate: Level absLevel = node->level_info->level[levelNo]; p_ctx = p_rdoq->p_ctx_coeff_level; rateLevel = est_rate_level(p_ctx, rank, absLevel, pairsInCG, iCG, 15 - node->pos, p_rdoq->b_luma); // rate: Sign rateLevel += biari_encode_symbol_eq_prob_est(absLevel < 0); // rate: Run[i] absSum5 = absLevel + rdoq_get_sum_abs_coeff(p_level_info, ncur_blk, node->pos + 1, pos_end); p_ctx = p_ctx_primary[XAVS2_MIN((absSum5 >> 1), 2)]; rateRunCurr = est_rate_run(p_rdoq, p_ctx, node->run, 15 - node->pos, iCG, node->pos); // rate: Run[i+1] // node->prev always exists if (node->prev->attrib == LAST_POS) { rateRunPrev = 0; } else if (node->prev->attrib == LAST_RUN) { if (node->prev->run != 16) { int scan_pos = tab_1d_scan_4x4[15 - node->prev->run]; rateRunPrev = est_rate_last_coeff_pos(p_rdoq, scan_pos & 3, scan_pos >> 2, 0, CGx, CGy); } else { rateRunPrev = 0; } p_cost_stat->lastRunCost = lambda_rdoq * rateRunPrev; } else { // RUN_LEVEL_PAIR pos_end = XAVS2_MIN(node->prev->pos + 6, 15); absSum5 = rdoq_get_sum_abs_coeff(p_level_info, ncur_blk, node->prev->pos, pos_end); p_ctx = p_ctx_primary[XAVS2_MIN((absSum5 >> 1), 2)]; rateRunPrev = est_rate_run(p_rdoq, p_ctx, node->prev->run, 15 - node->prev->pos, iCG, node->prev->pos); } // cost for the current (Level, Run) pair p_pair_cost->levelCost = (rdcost_t)(node->level_info->errLevel[levelNo] + lambda_rdoq * rateLevel); p_pair_cost->runCost = lambda_rdoq * rateRunCurr; p_pair_cost->scanPos = node->level_info->pos_scan; // calculate cost: distLevel[i] + rateLevel[i] + rateRun[i] + rateRun[i+1] lagr = (rdcost_t)(node->level_info->errLevel[levelNo] + lambda_rdoq * (rateLevel + rateRunCurr + rateRunPrev)); if (lagr < minlagr) { minlagr = lagr; best_state = levelNo; } lagrDelta = lambda_rdoq * rateRunPrev; } p_pair_cost->uncodedCost = (rdcost_t)(node->level_info->errLevel[0]); // compare cost of level or level-1 with uncoded case (level=0) // Run[i] if (node->prev->attrib != LAST_POS && (node->prev->attrib != LAST_RUN || node->next != NULL)) { if (node->prev->attrib == RUN_LEVEL_PAIR) { int pos_start = node->prev->pos; int pos_end = XAVS2_MIN(pos_start + 6, 15); absSum5 = rdoq_get_sum_abs_coeff(p_level_info, 
ncur_blk, pos_start, pos_end); p_ctx = p_ctx_primary[XAVS2_MIN((absSum5 >> 1), 2)]; rateRunMerged = est_rate_run(p_rdoq, p_ctx, node->prev->run + 1 + node->run, 15 - pos_start, iCG, pos_start); lagrDelta0 = p_cost_stat->pairCost[pairsInCG - 1].runCost; } else /*if (node->next != NULL)*/ { // LAST_RUN // only try 0 when there's more than 1 pair in the CG lagrDelta0 = p_cost_stat->lastRunCost; if (node->prev->run != 16) { int scan_pos = tab_1d_scan_4x4[15 - (node->prev->run + 1 + node->run)]; rateRunMerged = est_rate_last_coeff_pos(p_rdoq, scan_pos & 3, scan_pos >> 2, 0, CGx, CGy); } else { rateRunMerged = 0; } } // calculate cost: distLevel[i][0] + rate(Run[i] + Run[i+1] + 1) lagr = (rdcost_t)(node->level_info->errLevel[0] + lambda_rdoq * rateRunMerged); if (lagr < minlagr) { minlagr = lagr; lagrDelta = lagrDelta0; best_state = 0; } } // set SDQ results xx_yy = p_level_info[node->pos].pos_xy; absLevel = node->level = node->level_info->level[best_state]; ncur_blk[p_level_info[node->pos].pos_scan] = (coeff_t)absLevel; lagrAcc += minlagr - lagrDelta; lagrUncoded += (rdcost_t)(node->level_info->errLevel[0]); p_pair_cost->posBlockX = (int16_t)((xx_yy >> w_shift_x) & 0x3); p_pair_cost->posBlockY = (int16_t)((xx_yy >> w_shift_y) & 0x3); p_pair_cost->scanPos = p_level_info[node->pos].pos_scan; if (best_state == 0) { // adjust the run of the previous node and remove the current node node->prev->run += node->run + 1; if (node->prev->attrib == LAST_RUN) { p_cost_stat->lastRunCost = lambda_rdoq * rateRunMerged; } else { p_cost_stat->pairCost[pairsInCG - 1].runCost = lambda_rdoq * rateRunMerged; } remove_node(node); } else { pairsInCG++; p_pair_cost++; } // update rank level_max = XAVS2_MAX(level_max, absLevel); rank = tab_rank[XAVS2_MIN(5, level_max)]; } node = node->next; } if (!isLastCG) { int sig_cg_ctx = p_rdoq->b_luma && (iCG != 0); if (isSigCG) { lagrAcc += (p_cost_stat->sigCGFlagCost = lambda_rdoq * est_rate_nonzero_cg_flag(p_rdoq, 1, sig_cg_ctx)); lagrUncoded += (p_cost_stat->sigCGFlagCost0 = lambda_rdoq * est_rate_nonzero_cg_flag(p_rdoq, 0, sig_cg_ctx)); // try to turn CG to all-zero here. don't do this to last CG if (lagrUncoded < lagrAcc) { rdoq_memset_zero_coeffs(ncur_blk, iCG << 4, (iCG + 1) << 4); p_sig_cg_flag[iCG] = 0; p_cost_stat->sigCGFlagCost = (iCG == 0) ? 
0 : p_cost_stat->sigCGFlagCost0; p_cost_stat->lastRunCost = 0; pairsInCG = 0; rank = rank_pre; } } else { p_cost_stat->sigCGFlagCost = lambda_rdoq * est_rate_nonzero_cg_flag(p_rdoq, 0, sig_cg_ctx); } } p_cost_stat->pairNum = pairsInCG; return rank; } /* --------------------------------------------------------------------------- */ static int rdoq_cg(xavs2_t *h, rdoq_t *p_rdoq, cu_t *p_cu, coeff_t *ncur_blk, const int num_coeff, int qp) { ALIGN16(cost_state_t cg_cost_stat [64]); ALIGN16(level_info_t cg_level_data[16]); // level data in a CG int8_t *p_sig_cg_flag = p_rdoq->sig_cg_flag; node_list_t list_run_level; cost_state_t *p_cost_stat; const int16_t *p_tab_coeff_scan1d = p_rdoq->p_scan_tab_1d; const int i_tu_level = p_rdoq->i_tu_level; const int shift_bit = 16 - (h->param->sample_bit_depth + 1) - i_tu_level; const double f_err_level_mult = 256.0 / (1 << (shift_bit * 2)); const int thres_lower_int = (int)((16384 << shift_bit) / (double)(tab_Q_TAB[qp])); const rdcost_t lambda_rdoq = h->f_lambda_rdoq; int last_pos = -1; int rank = 0; int num_cg; int i_cg; int num_nonzero = 0; // number of non-zero coefficients #if ENABLE_WQUANT wq_data_t *wq = &h->wq_data; int wqm_size_id = 0; int wqm_size = 0; /* init weighted quant block size */ if (h->WeightQuantEnable) { wqm_size_id = get_block_size_id_wq(p_cu->cu_info.i_tu_split, i_tu_level, p_rdoq->b_luma); wqm_size = 1 << (wqm_size_id + 2); } #else UNUSED_PARAMETER(p_cu); #endif /* init */ list_init(&list_run_level); memset(p_sig_cg_flag, 0, sizeof(p_rdoq->sig_cg_flag)); /* βȫϵcg */ num_cg = rdoq_get_last_cg_pos(ncur_blk, num_coeff, thres_lower_int); for (i_cg = num_cg - 1; i_cg >= 0; i_cg--) { node_t *p_node = NULL; int idx_coeff_in_cg = 15; int idx_coeff = (i_cg << 4) + idx_coeff_in_cg; p_cost_stat = &cg_cost_stat[i_cg]; for (; idx_coeff_in_cg >= 0; idx_coeff_in_cg--, idx_coeff--) { level_info_t *p_level_info = &cg_level_data[idx_coeff_in_cg]; int xx_yy; // quant_init xx_yy = p_tab_coeff_scan1d[idx_coeff]; /* 1, generate levels of one coefficient [xx, yy] */ #if ENABLE_WQUANT if (h->WeightQuantEnable) { ncur_blk[idx_coeff] = (coeff_t)rdoq_est_coeff_level_wq(h, p_level_info, wq, wqm_size_id, wqm_size, xx, yy, ncur_blk[idx_coeff], qp, shift_bit); } else { ncur_blk[idx_coeff] = (coeff_t)rdoq_est_coeff_level(p_level_info, ncur_blk[idx_coeff], qp, shift_bit, f_err_level_mult, thres_lower_int); } #else ncur_blk[idx_coeff] = (coeff_t)rdoq_est_coeff_level(p_level_info, ncur_blk[idx_coeff], qp, shift_bit, f_err_level_mult, thres_lower_int); #endif p_level_info->pos_xy = xx_yy; p_level_info->pos_scan = idx_coeff; /* 2, build (Level, Run) pair linked list */ if (last_pos == -1) { // last is not found yet if (p_level_info->num_level > 1) { list_init(&list_run_level); // found last position in last CG last_pos = idx_coeff_in_cg; // first node in the list is last position p_node = create_and_append_node(&list_run_level, NULL, LAST_POS, idx_coeff_in_cg); // the second node is the (run, pair) pair p_node = create_and_append_node(&list_run_level, p_level_info, RUN_LEVEL_PAIR, idx_coeff_in_cg); num_cg = i_cg + 1; p_sig_cg_flag[i_cg] = 1; // this is the last CG } } else { // last is found // first node is last run if (idx_coeff_in_cg == 15) { // a new CG begins list_init(&list_run_level); // the position of the last run is always initialized to 15 p_node = create_and_append_node(&list_run_level, NULL, LAST_RUN, idx_coeff_in_cg); } // starting from the 2nd node, it is (level, run) node if (p_level_info->num_level > 1) { p_node = create_and_append_node(&list_run_level, 
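/*
 * rdoq_cg() threads the significant coefficients of one CG into a doubly linked list of
 * (level, run) nodes backed by a small fixed array, so dropping a coefficient later is
 * just an unlink plus a run merge in the neighbouring node. A self-contained sketch of
 * such a list follows; the toy_* names are illustrative and the capacity check is left
 * to the caller, as with nodeBuf[] above.
 */
#if 0   /* fixed-buffer doubly linked list, in the spirit of node_list_t */
#include <stddef.h>

typedef struct toy_node_t toy_node_t;
struct toy_node_t {
    toy_node_t *prev, *next;
    int         level, run;
};

typedef struct {
    toy_node_t  buf[20];
    toy_node_t *head, *tail;
    int         size;
} toy_list_t;

static toy_node_t *toy_append(toy_list_t *l, int level, int run)
{
    toy_node_t *n = &l->buf[l->size++];   /* caller must not exceed the buffer capacity */
    n->level = level;
    n->run   = run;
    n->next  = NULL;
    n->prev  = l->tail;
    if (l->tail) l->tail->next = n;
    else         l->head       = n;
    l->tail = n;
    return n;
}

static void toy_unlink(toy_list_t *l, toy_node_t *n)
{
    if (n->prev) n->prev->next = n->next;
    else         l->head       = n->next;
    if (n->next) n->next->prev = n->prev;
    else         l->tail       = n->prev;
}
#endif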
p_level_info, RUN_LEVEL_PAIR, idx_coeff_in_cg); p_sig_cg_flag[i_cg] = 1; // get the real position of last run if (p_node->prev->attrib == LAST_RUN) { p_node->prev->pos = idx_coeff_in_cg; } } else { p_node->run++; } } } /* 3, estimate costs */ if (last_pos != -1) { // a CG just ended rank = rdoq_est_cg(h, p_rdoq, cg_level_data, p_cost_stat, list_run_level.head, ncur_blk, p_sig_cg_flag, i_cg, rank); num_nonzero += p_cost_stat->pairNum; } } if (!num_nonzero) { return 0; } /* 4, estimate last */ i_cg = num_cg - 1; if (last_pos != -1) { int CGx, CGy; int xx, yy; int pos_last_scan = last_pos + (i_cg << 4); // get scan position of the last rdcost_t cost_uncoded_block = 0; rdcost_t cost_prev_last_cg, cost_prev_last_pos; rdcost_t cost_prev_level = 0, cost_prev_run = 0, cost_prev_uncoded = 0; rdcost_t cost_best, cost_curr; cost_prev_last_cg = lambda_rdoq * est_rate_last_cg_pos(p_rdoq, num_cg - 1, &CGx, &CGy); p_cost_stat = &cg_cost_stat[i_cg]; xx = p_cost_stat->pairCost[0].posBlockX; yy = p_cost_stat->pairCost[0].posBlockY; cost_prev_last_pos = lambda_rdoq * est_rate_last_coeff_pos(p_rdoq, xx, yy, 1, CGx, CGy); // init cost_best cost_best = cost_prev_last_cg + cost_prev_last_pos + cost_prev_level + cost_prev_run; cost_curr = cost_best; for (; i_cg >= 0; i_cg--, p_cost_stat--) { int pairNo = 0; rdcost_t cost_curr_last_cg = lambda_rdoq * est_rate_last_cg_pos(p_rdoq, i_cg, &CGx, &CGy); rdcost_t cost_curr_last_pos; pair_cost_t *p_pair_cost = &p_cost_stat->pairCost[pairNo]; if (i_cg != num_cg - 1) { // last run cost_best += p_cost_stat->lastRunCost; if (i_cg > 0) { cost_best += p_cost_stat->sigCGFlagCost; } } cost_curr += cost_curr_last_cg - cost_prev_last_cg; for (; pairNo < p_cost_stat->pairNum; pairNo++, p_pair_cost++) { // when pairNo == 0, it is the last pair in CG // last position in last CG xx = p_pair_cost->posBlockX; yy = p_pair_cost->posBlockY; cost_curr_last_pos = lambda_rdoq * est_rate_last_coeff_pos(p_rdoq, xx, yy, 1, CGx, CGy); cost_curr += cost_curr_last_pos - cost_prev_last_pos + cost_prev_uncoded + p_pair_cost->levelCost - cost_prev_level + p_pair_cost->runCost - cost_prev_run; cost_best += p_pair_cost->levelCost + p_pair_cost->runCost; cost_uncoded_block += p_pair_cost->uncodedCost; cost_prev_uncoded = p_pair_cost->uncodedCost; cost_prev_level = p_pair_cost->levelCost; cost_prev_run = p_pair_cost->runCost; cost_prev_last_pos = cost_curr_last_pos; if (cost_curr <= cost_best) { cost_best = cost_curr; rdoq_memset_zero_coeffs(ncur_blk, p_pair_cost->scanPos + 1, pos_last_scan + 1); pos_last_scan = p_pair_cost->scanPos; } } cost_prev_last_cg = cost_curr_last_cg; } // cost_uncoded_block is the total uncoded distortion // cost_best is the summation of Best LastPos and Best lastCG and CGSign and lastrun and (run,level)s if (cost_uncoded_block < cost_best) { rdoq_memset_zero_coeffs(ncur_blk, 0, pos_last_scan + 1); return 0; } i_cg = num_cg - 2; p_cost_stat = &cg_cost_stat[i_cg]; // estimate last for each non-last CG for (; i_cg >= 0; i_cg--, p_cost_stat--) { pair_cost_t *p_pair_cost = &p_cost_stat->pairCost[0]; int lastScanPosInCG; int pairNo; rdcost_t cost_curr_last_pos; if (p_sig_cg_flag[i_cg] == 0) { continue; } lastScanPosInCG = p_pair_cost->scanPos; // Last Position in current CG xx = p_pair_cost->posBlockX; yy = p_pair_cost->posBlockY; cost_prev_last_pos = lambda_rdoq * est_rate_last_coeff_pos(p_rdoq, xx, yy, 0, CGx, CGy); cost_prev_level = 0; cost_prev_run = 0; cost_prev_uncoded = 0; cost_curr = cost_prev_last_pos; cost_best = cost_curr; for (pairNo = 0; pairNo < p_cost_stat->pairNum; 
pairNo++, p_pair_cost++) { // Last Position in current CG xx = p_pair_cost->posBlockX; yy = p_pair_cost->posBlockY; cost_curr_last_pos = lambda_rdoq * est_rate_last_coeff_pos(p_rdoq, xx, yy, 0, CGx, CGy); cost_curr += cost_curr_last_pos - cost_prev_last_pos + cost_prev_uncoded + p_pair_cost->levelCost - cost_prev_level + p_pair_cost->runCost - cost_prev_run; if (pairNo == p_cost_stat->pairNum - 1) { cost_curr += p_cost_stat->sigCGFlagCost0 - p_cost_stat->sigCGFlagCost; } cost_best += p_pair_cost->levelCost + p_pair_cost->runCost; cost_prev_uncoded = p_pair_cost->uncodedCost; cost_prev_level = p_pair_cost->levelCost; cost_prev_run = p_pair_cost->runCost; cost_prev_last_pos = cost_curr_last_pos; if (cost_curr <= cost_best) { cost_best = cost_curr; rdoq_memset_zero_coeffs(ncur_blk, p_pair_cost->scanPos + 1, lastScanPosInCG + 1); lastScanPosInCG = p_pair_cost->scanPos; } } } } return num_nonzero; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void rdoq_init(rdoq_t *p_rdoq, aec_t *p_aec, cu_t *p_cu, int bsx, int bsy, int i_tu_level, int b_luma, int intra_mode) { int scan_level = XAVS2_MIN(2, i_tu_level - 2); int b_swap_xy; b_swap_xy = (b_luma && tab_intra_mode_scan_type[intra_mode] == INTRA_PRED_HOR && p_cu->cu_info.i_mode != PRED_I_2Nxn && p_cu->cu_info.i_mode != PRED_I_nx2N); p_rdoq->num_cg_x = bsx >> 2; p_rdoq->num_cg_y = bsy >> 2; p_rdoq->i_tu_level = i_tu_level; p_rdoq->b_luma = b_luma; p_rdoq->b_dc_diag = (b_luma && tab_intra_mode_scan_type[intra_mode] != INTRA_PRED_DC_DIAG) ? 0 : 1; if (b_swap_xy) { p_rdoq->bit_size_shift_x = xavs2_log2u(bsx); p_rdoq->bit_size_shift_y = 0; } else { p_rdoq->bit_size_shift_x = 0; p_rdoq->bit_size_shift_y = xavs2_log2u(bsx); } if (b_luma && p_cu->cu_info.i_tu_split == TU_SPLIT_HOR) { p_rdoq->p_scan_cg = tab_cg_scan_list_hor [scan_level]; p_rdoq->p_scan_tab_1d = tab_coef_scan1_list_hor[scan_level]; } else if (b_luma && p_cu->cu_info.i_tu_split == TU_SPLIT_VER) { p_rdoq->p_scan_cg = tab_cg_scan_list_ver [scan_level]; p_rdoq->p_scan_tab_1d = tab_coef_scan1_list_ver[scan_level]; } else { scan_level = XAVS2_MIN(3, i_tu_level - 2); p_rdoq->p_scan_cg = tab_cg_scan_list_nxn[scan_level]; p_rdoq->p_scan_tab_1d = tab_coef_scan1_list_nxn[b_swap_xy][scan_level]; } // initialize contexts if (b_luma) { p_rdoq->p_ctx_primary = p_aec->p_ctx_set->coeff_run[0]; p_rdoq->p_ctx_sign_cg = p_aec->p_ctx_set->nonzero_cg_flag; p_rdoq->p_ctx_last_cg = p_aec->p_ctx_set->last_cg_contexts; p_rdoq->p_ctx_last_pos = p_aec->p_ctx_set->last_pos_contexts; } else { p_rdoq->p_ctx_primary = p_aec->p_ctx_set->coeff_run[1]; p_rdoq->p_ctx_sign_cg = p_aec->p_ctx_set->nonzero_cg_flag + NUM_SIGN_CG_CTX_LUMA; p_rdoq->p_ctx_last_cg = p_aec->p_ctx_set->last_cg_contexts + NUM_LAST_CG_CTX_LUMA; p_rdoq->p_ctx_last_pos = p_aec->p_ctx_set->last_pos_contexts + NUM_LAST_POS_CTX_LUMA; } p_rdoq->p_ctx_coeff_level = p_aec->p_ctx_set->coeff_level; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ int rdoq_block(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *cur_blk, int bsx, int bsy, int i_tu_level, int qp, int b_luma, int intra_mode) { cu_parallel_t *p_enc = cu_get_enc_context(h, p_cu->cu_info.i_level); rdoq_t *p_rdoq = &p_enc->rdoq_info; coeff_t *p_coeff = p_rdoq->coeff_buff; coeff_t *ncur_blk = p_rdoq->ncur_blk; const int coeff_num = 
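/*
 * rdoq_block() below takes absolute values, reorders the block into scan order through
 * p_scan_tab_1d, runs the CG-wise optimization, and then writes the surviving levels
 * back through the same table before re-applying the signs. The sketch isolates that
 * forward/inverse reordering; scan_forward() and scan_inverse() are illustrative names.
 */
#if 0   /* one scan table serves both directions: scan position -> raster position */
#include <stdint.h>

static void scan_forward(int16_t *dst_scan, const int16_t *src_raster,
                         const int16_t *scan_tab, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        dst_scan[i] = src_raster[scan_tab[i]];
    }
}

static void scan_inverse(int16_t *dst_raster, const int16_t *src_scan,
                         const int16_t *scan_tab, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        dst_raster[scan_tab[i]] = src_scan[i];
    }
}
#endif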
bsx * bsy; int num_non_zero = 0; int i; const int16_t *p_tab_coeff_scan_1d; rdoq_init(p_rdoq, p_aec, p_cu, bsx, bsy, i_tu_level, b_luma, intra_mode); g_funcs.dctf.abs_coeff(p_coeff, cur_blk, coeff_num); /* scan the coeffs */ p_tab_coeff_scan_1d = p_rdoq->p_scan_tab_1d; for (i = 0; i < coeff_num; i++) { ncur_blk[i] = p_coeff[p_tab_coeff_scan_1d[i]]; } num_non_zero = rdoq_cg(h, p_rdoq, p_cu, ncur_blk, coeff_num, qp); /* inverse scan the coeffs */ if (num_non_zero) { for (i = 0; i < coeff_num; i++) { p_coeff[p_tab_coeff_scan_1d[i]] = ncur_blk[i]; } num_non_zero = g_funcs.dctf.add_sign(cur_blk, p_coeff, coeff_num); } else { cur_blk[0] = 0; } return num_non_zero; } xavs2-1.3/source/encoder/rdoq.h000066400000000000000000000032611340660520300164560ustar00rootroot00000000000000/* * rdoq.h * * Description of this file: * RDOQ functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_RDOQ_H #define XAVS2_RDOQ_H #define rdoq_block FPFX(rdoq_block) int rdoq_block(xavs2_t *h, aec_t *p_aec, cu_t *p_cu, coeff_t *cur_blk, int bsx, int bsy, int i_tu_level, int qp, int b_luma, int intra_mode); #endif // XAVS2_RDOQ_H xavs2-1.3/source/encoder/rps.c000066400000000000000000001016771340660520300163220ustar00rootroot00000000000000/* * rps.c * * Description of this file: * RPS functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common/common.h" #include "cudata.h" #include "wrapper.h" #include "ratecontrol.h" #include "rps.h" /** * =========================================================================== * local definitions * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void ALWAYS_INLINE set_ref_man(xavs2_rps_t *p_refman, int idx, int poc, int qp_offset, int refered_by_others, int num_of_ref, int ref_pic[4], int num_to_rm, int rm_pic[4], int temporal_id) { p_refman->idx_in_gop = idx; p_refman->poc = poc; p_refman->qp_offset = qp_offset; p_refman->referd_by_others = refered_by_others; p_refman->num_of_ref = num_of_ref; memcpy(p_refman->ref_pic, ref_pic, XAVS2_MAX_REFS * sizeof(int)); p_refman->num_to_rm = num_to_rm; memcpy(p_refman->rm_pic, rm_pic, XAVS2_MAX_REFS * sizeof(int)); p_refman->temporal_id = temporal_id; } /* --------------------------------------------------------------------------- * low delay configuration for reference management */ static void default_reference_management_ldp(xavs2_rps_t *p_refman) { int ref_pic0[4] = { 1, 5, 9, 13 }; int ref_pic1[4] = { 1, 2, 6, 10 }; int ref_pic2[4] = { 1, 3, 7, 11 }; int ref_pic3[4] = { 1, 4, 8, 12 }; int remove_pic0[4] = { 2, -1, -1, -1 }; int remove_pic1[4] = { 4, -1, -1, -1 }; int remove_pic2[4] = { 2, -1, -1, -1 }; int remove_pic3[4] = { 12, -1, -1, -1 }; memset(p_refman, -1, sizeof(xavs2_rps_t) * XAVS2_MAX_GOPS); set_ref_man(&p_refman[0], 0, 1, 5, 1, 4, ref_pic0, 1, remove_pic0, -1); set_ref_man(&p_refman[1], 1, 2, 4, 1, 4, ref_pic1, 1, remove_pic1, -1); set_ref_man(&p_refman[2], 2, 3, 5, 1, 4, ref_pic2, 1, remove_pic2, -1); set_ref_man(&p_refman[3], 3, 4, 2, 1, 4, ref_pic3, 1, remove_pic3, -1); } /* --------------------------------------------------------------------------- * random access configuration (GOP 4) for reference management * (the max default reference frame number is 4) * coding order index in mini-GOP: 2---1---3---0 * type+POC: I0 B1 B2 B3 P4 B5 B6 B7 P8 B9 B10 B11 P12 B13 B14 B15 P16 B17 B18 B19 P20 * COI: 0 3 2 4 1 7 6 8 5 11 10 12 9 15 14 16 13 19 18 20 17 * 3 4 | | | | * 2 | | | +-- ref_pic: 4: 13 - 4 = 9 * 0 1 | | | | 3: 13 - 3 = 10 * encoding layer | | | | 8: 13 - 8 = 5 * | | | | 12: 13 - 12 = 1 * | | | | * rm_pic = { 9,12 }, in order to | | | +-- rm_pic: 9: 13 - 9 = 4 * 9: remove I0 after P12 encoded | | | 12: 13 - 12 = 1 * 12: remove the most front reference frame | | | * | | +------ ref_pic: 3: 16 - 3 = 13 * | | 2: 16 - 2 = 14 * | | * | +---------- ref_pic: 1: 14 - 1 = 13 * | | 5: 14 - 5 = 9 * | | * rm_pic = { 4 }, in order to | +---------- rm_pic: 4: 14 - 4 = 10 * remove the reference picture-B in last mini-GOP | * +-------------- ref_pic: 1: 15 - 1 = 14 * 6: 15 - 6 = 9 */ static void default_reference_management_ra_gop4(xavs2_rps_t *p_refman) { int ref_pic0[4] = { 4, 3, 8, 12 }; int ref_pic1[4] = { 1, 5, -1, -1 }; int ref_pic2[4] = { 1, 6, -1, -1 }; int ref_pic3[4] = { 3, 2, -1, -1 }; int remove_pic0[4] = { 9, 12, -1, -1 }; int remove_pic1[4] = { 4, -1, -1, -1 }; int remove_pic2[4] = { -1, -1, -1, -1 }; int remove_pic3[4] = { -1, -1, -1, -1 }; memset(p_refman, 
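/*
 * In the tables above, ref_pic[] and rm_pic[] hold distances in coding order: a value d
 * stored for a picture with coding order index (COI) c denotes the picture coded at
 * COI c - d, which is how the reference lookup later resolves them. A small worked
 * sketch of that convention; deltas_to_coi() is an illustrative helper, and with the
 * low-delay delta set {1, 5, 9, 13} a picture at COI 20 would reference COIs 19, 15, 11 and 7.
 */
#if 0   /* worked example of the delta-COI convention used by the RPS tables */
static void deltas_to_coi(int cur_coi, const int *deltas, int num, int *ref_coi)
{
    int k;
    for (k = 0; k < num; k++) {
        ref_coi[k] = cur_coi - deltas[k];   /* same arithmetic as the reference lookup */
    }
}
#endif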
-1, sizeof(xavs2_rps_t) * XAVS2_MAX_GOPS); set_ref_man(&p_refman[0], 0, 4, 2, 1, 4, ref_pic0, 2, remove_pic0, -1); set_ref_man(&p_refman[1], 1, 2, 3, 1, 2, ref_pic1, 1, remove_pic1, -1); set_ref_man(&p_refman[2], 2, 1, 4, 0, 2, ref_pic2, 0, remove_pic2, -1); set_ref_man(&p_refman[3], 3, 3, 4, 0, 2, ref_pic3, 0, remove_pic3, -1); } /* --------------------------------------------------------------------------- * random access configuration (GOP 8) for reference management */ static void default_reference_management_ra_gop8(xavs2_rps_t *p_refman) { int ref_pic0[4] = { 8, 3, 7, 16 }; int ref_pic1[4] = { 1, 9, -1, -1 }; int ref_pic2[4] = { 1, 10, -1, -1 }; int ref_pic3[4] = { 1, 11, -1, -1 }; int ref_pic4[4] = { 3, 2, -1, -1 }; int ref_pic5[4] = { 5, 4, -1, -1 }; int ref_pic6[4] = { 1, 5, -1, -1 }; int ref_pic7[4] = { 7, 2, -1, -1 }; int remove_pic0[4] = { 16, 17, -1, -1 }; int remove_pic1[4] = { 4, -1, -1, -1 }; int remove_pic2[4] = { 9, -1, -1, -1 }; int remove_pic3[4] = { -1, -1, -1, -1 }; int remove_pic4[4] = { -1, -1, -1, -1 }; int remove_pic5[4] = { -1, -1, -1, -1 }; int remove_pic6[4] = { 4, -1, -1, -1 }; int remove_pic7[4] = { -1, -1, -1, -1 }; memset(p_refman, -1, sizeof(xavs2_rps_t) * XAVS2_MAX_GOPS); set_ref_man(&p_refman[0], 0, 8, 1, 1, 4, ref_pic0, 2, remove_pic0, -1); set_ref_man(&p_refman[1], 1, 4, 1, 1, 2, ref_pic1, 1, remove_pic1, -1); set_ref_man(&p_refman[2], 2, 2, 2, 1, 2, ref_pic2, 1, remove_pic2, -1); set_ref_man(&p_refman[3], 3, 1, 4, 0, 2, ref_pic3, 0, remove_pic3, -1); set_ref_man(&p_refman[4], 4, 3, 4, 0, 2, ref_pic4, 0, remove_pic4, -1); set_ref_man(&p_refman[5], 5, 6, 2, 1, 2, ref_pic5, 0, remove_pic5, -1); set_ref_man(&p_refman[6], 6, 5, 4, 0, 2, ref_pic6, 1, remove_pic6, -1); set_ref_man(&p_refman[7], 7, 7, 4, 0, 2, ref_pic7, 0, remove_pic7, -1); } /* --------------------------------------------------------------------------- * find a frame in DPB with the specific COI */ static xavs2_frame_t *find_frame_by_coi(xavs2_frame_buffer_t *frm_buf, int coi) { xavs2_frame_t *frame; int i; for (i = 0; i < frm_buf->num_frames; i++) { if ((frame = frm_buf->frames[i]) != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame->i_frm_coi == coi) { xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ return frame; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } return NULL; } /* --------------------------------------------------------------------------- * get RPS of one frame */ static int xavs2e_get_frame_rps(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps) { const xavs2_rps_t *p_seq_rps = h->param->cfg_ref_all; xavs2_frame_t *frame = NULL; int rps_idx = 0; int j; if (cur_frm->i_frm_type == XAVS2_TYPE_I) { if (h->param->intra_period_max == 1) { memcpy(p_rps, &p_seq_rps[0], sizeof(xavs2_rps_t)); p_rps->num_of_ref = 0; // clear reference frames for I frame p_rps->referd_by_others = 0; } else { p_rps->idx_in_gop = -1; p_rps->num_of_ref = 0; p_rps->num_to_rm = 0; p_rps->referd_by_others = 1; if (!h->param->b_open_gop || !h->param->num_bframes) { // IDR refresh for (j = 0; j < frm_buf->num_frames; j++) { if ((frame = frm_buf->frames[j]) != NULL && cur_frm->i_frame != frame->i_frame) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ assert(p_rps->num_to_rm < sizeof(p_rps->rm_pic) / sizeof(p_rps->rm_pic[0])); if (frame->rps.referd_by_others == 1 && frame->cnt_refered > 0) { if (cur_frm->i_frm_coi - frame->i_frm_coi < 64) { /* only 6 bits for delta coi */ p_rps->rm_pic[p_rps->num_to_rm++] = cur_frm->i_frm_coi - 
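/*
 * When xavs2e_get_frame_rps() builds the removal list it only records a picture if its
 * COI distance fits the 6-bit syntax element (delta < 64), and it stops at the rm_pic[]
 * capacity or at the 7-picture limit noted for the sequence/picture header. The sketch
 * below condenses those checks; try_add_removal() is an illustrative helper and not part
 * of the encoder.
 */
#if 0   /* constraints applied when appending one entry to a removal list */
static int try_add_removal(int *rm_pic, int *num_to_rm, int capacity,
                           int cur_coi, int ref_coi)
{
    int delta = cur_coi - ref_coi;
    if (delta <= 0 || delta >= 64) {            /* only 6 bits are available for the delta */
        return 0;
    }
    if (*num_to_rm >= capacity || *num_to_rm >= 7) {
        return 0;                               /* array capacity / syntax limit reached */
    }
    rm_pic[(*num_to_rm)++] = delta;
    return 1;
}
#endif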
frame->i_frm_coi; } } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ if (p_rps->num_to_rm == sizeof(p_rps->rm_pic) / sizeof(p_rps->rm_pic[0])) { break; } /* at most 7 frames can be removed limited to the "num of removed picture" in sequence/picture header */ if (p_rps->num_to_rm == 7) { break; } } } } else { // RA OpenGOP, I֡P/F֡λãP/F֡Ƴ֡б memcpy(p_rps, &p_seq_rps[0], sizeof(xavs2_rps_t)); p_rps->idx_in_gop = -1; p_rps->num_of_ref = 0; p_rps->referd_by_others = 1; } p_rps->qp_offset = 0; } } else { rps_idx = (cur_frm->i_frm_coi - 1 - ((!h->param->b_open_gop && h->param->num_bframes > 0) ? frm_buf->COI_IDR : 0)) % h->i_gop_size; memcpy(p_rps, &p_seq_rps[rps_idx], sizeof(xavs2_rps_t)); if (cur_frm->i_frame > frm_buf->POC_IDR && (!h->param->b_open_gop || !h->param->num_bframes)) { /* clear frames before IDR frame */ for (j = 0; j < frm_buf->num_frames; j++) { if ((frame = frm_buf->frames[j]) != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ assert(p_rps->num_to_rm < sizeof(p_rps->rm_pic) / sizeof(p_rps->rm_pic[0])); if (frame->rps.referd_by_others == 1 && frame->cnt_refered > 0) { /* only 6 bits for delta coi */ if (frame->i_frame < frm_buf->POC_IDR && cur_frm->i_frm_coi - frame->i_frm_coi < 64) { p_rps->rm_pic[p_rps->num_to_rm++] = cur_frm->i_frm_coi - frame->i_frm_coi; } } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ if (p_rps->num_to_rm == sizeof(p_rps->rm_pic) / sizeof(p_rps->rm_pic[0])) { break; } /* at most 7 frames can be removed limited to the "num of removed picture" in sequence/picture header */ if (p_rps->num_to_rm == 7) { break; } } } } } return rps_idx; } /* --------------------------------------------------------------------------- * build reference list according to RPS, returns the number of frames found */ static INLINE int rps_init_reference_list(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps, xavs2_frame_t *frefs[XAVS2_MAX_REFS]) { xavs2_frame_t *frame; int i, k, m; int num_ref = 0; for (i = 0; i < XAVS2_MAX_REFS; i++) { frefs[i] = NULL; } assert(p_rps->num_of_ref <= XAVS2_MAX_REFS); for (i = 0; i < p_rps->num_of_ref; i++) { for (m = 0;; m++) { int coi = cur_frm->i_frm_coi - p_rps->ref_pic[i]; frame = find_frame_by_coi(frm_buf, coi); if (frame != NULL) { int b_could_be_referenced; xavs2_thread_mutex_lock(&frame->mutex); /* lock */ /* check whether the frame is already in the reference list */ for (k = 0; k < num_ref; k++) { if (frefs[k] == frame) { // already in the reference list p_rps->idx_in_gop = -1; break; } } /* check whether the frame could be referenced by current frame */ b_could_be_referenced = frame->i_frame >= frm_buf->POC_IDR || (frame->i_frame < frm_buf->POC_IDR && cur_frm->i_frm_type == XAVS2_TYPE_B && h->param->b_open_gop); if (k == num_ref && frame->i_frm_coi == coi && frame->cnt_refered > 0 && b_could_be_referenced) { // put in the reference list assert(frame->cnt_refered > 0); assert(frame->rps.referd_by_others != 0); // hold reference to this frame frame->cnt_refered++; frefs[num_ref] = frame; p_rps->ref_pic[num_ref] = cur_frm->i_frm_coi - frame->i_frm_coi; num_ref++; xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ /* a reference frame found */ break; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } /* reference frame not found in the second run, break now */ if (m > 0) { break; } // reference frame not found, fall back on the IDR frame. 
p_rps->ref_pic[i] = cur_frm->i_frm_coi - frm_buf->COI_IDR; p_rps->idx_in_gop = -1; } } return num_ref; } /* --------------------------------------------------------------------------- * fix reference list of B frame, returns the number of reference frames */ static INLINE int rps_fix_reference_list_b(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps, xavs2_frame_t *frefs[XAVS2_MAX_REFS]) { xavs2_frame_t **DPB = frm_buf->frames; xavs2_frame_t *frame; int i; // reassign reference frames for this B frame. int max_fwd_idx = -1, min_bwd_idx = -1; dist_t max_fwd_poi = 0, min_bwd_poi = MAX_DISTORTION; UNUSED_PARAMETER(h); for (i = 0; i < frm_buf->num_frames; i++) { if ((frame = DPB[i]) != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame->rps.referd_by_others != 0 && frame->cnt_refered > 0 && frame->i_frame < cur_frm->i_frame && frame->i_frame > max_fwd_poi) { if (max_fwd_idx != -1) { xavs2_thread_mutex_lock(&DPB[max_fwd_idx]->mutex); /* lock */ DPB[max_fwd_idx]->cnt_refered--; assert(DPB[max_fwd_idx]->cnt_refered >= 0); xavs2_thread_mutex_unlock(&DPB[max_fwd_idx]->mutex); /* unlock */ } frame->cnt_refered++; max_fwd_idx = i; max_fwd_poi = frame->i_frame; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } assert(max_fwd_idx >= 0); assert(DPB[max_fwd_idx]->cnt_refered > 0); xavs2_thread_mutex_lock(&frefs[1]->mutex); /* lock */ frefs[1]->cnt_refered--; assert(frefs[1]->cnt_refered >= 0); xavs2_thread_mutex_unlock(&frefs[1]->mutex); /* unlock */ frefs[1] = DPB[max_fwd_idx]; for (i = 0; i < frm_buf->num_frames; i++) { if ((frame = DPB[i]) != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame->rps.referd_by_others != 0 && frame->cnt_refered > 0 && frame->i_frame > cur_frm->i_frame && frame->i_frame < min_bwd_poi) { if (min_bwd_idx != -1) { xavs2_thread_mutex_lock(&DPB[min_bwd_idx]->mutex); /* lock */ DPB[min_bwd_idx]->cnt_refered--; assert(DPB[min_bwd_idx]->cnt_refered >= 0); xavs2_thread_mutex_unlock(&DPB[min_bwd_idx]->mutex); /* unlock */ } frame->cnt_refered++; min_bwd_idx = i; min_bwd_poi = frame->i_frame; } xavs2_thread_mutex_unlock(&frame->mutex);/* unlock */ } } assert(min_bwd_idx >= 0); assert(DPB[min_bwd_idx]->cnt_refered > 0); xavs2_thread_mutex_lock(&frefs[0]->mutex); /* lock */ frefs[0]->cnt_refered--; assert(frefs[0]->cnt_refered >= 0); xavs2_thread_mutex_unlock(&frefs[0]->mutex); /* unlock */ frefs[0] = DPB[min_bwd_idx]; p_rps->ref_pic[0] = cur_frm->i_frm_coi - frefs[0]->i_frm_coi; p_rps->ref_pic[1] = cur_frm->i_frm_coi - frefs[1]->i_frm_coi; p_rps->idx_in_gop = -1; return 2; // number of reference frames for B frame } /* --------------------------------------------------------------------------- * fix reference list of P/F frame, returns the number of reference frames */ static INLINE int rps_fix_reference_list_pf(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps, xavs2_frame_t *frefs[XAVS2_MAX_REFS]) { xavs2_frame_t **DPB = frm_buf->frames; xavs2_frame_t *frame; int i, j, k; int num_ref = p_rps->num_of_ref; for (i = num_ref; i < h->i_max_ref; i++) { int max_fwd_idx = -1; int max_fwd_poi = frm_buf->POC_IDR; int switch_flag = 0; for (j = 0; j < frm_buf->num_frames; j++) { if ((frame = DPB[j]) != NULL && frame->rps.referd_by_others) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ int poi = frame->i_frame; for (k = 0; k < num_ref; k++) { if (frefs[k] == frame) { break; } } if (k < num_ref) { xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ continue; 
} if (poi < cur_frm->i_frame && poi > max_fwd_poi && XAVS2_ABS(poi - cur_frm->i_frame) < 128 && frame->cnt_refered > 0 && (h->param->temporal_id_exist_flag == 0 || h->i_layer >= frame->rps.temporal_id)) { if (max_fwd_idx != -1) { xavs2_thread_mutex_lock(&DPB[max_fwd_idx]->mutex); /* lock */ DPB[max_fwd_idx]->cnt_refered--; assert(DPB[max_fwd_idx]->cnt_refered >= 0); xavs2_thread_mutex_unlock(&DPB[max_fwd_idx]->mutex); /* unlock */ } assert(frame->rps.referd_by_others != 0); frame->cnt_refered++; max_fwd_idx = j; max_fwd_poi = poi; switch_flag = 1; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } if (switch_flag == 0) { break; } assert(DPB[max_fwd_idx]->cnt_refered > 0); frefs[i] = DPB[max_fwd_idx]; p_rps->ref_pic[i] = cur_frm->i_frm_coi - frefs[i]->i_frm_coi; p_rps->idx_in_gop = -1; num_ref++; } return num_ref; // number of reference frames for B frame } /* --------------------------------------------------------------------------- * check whether a frame is writable: * no one is referencing this frame */ static ALWAYS_INLINE int frame_is_writable(const xavs2_handler_t *h_mgr, xavs2_frame_t *frame) { UNUSED_PARAMETER(h_mgr); return frame->cnt_refered == 0; } /* --------------------------------------------------------------------------- * check whether a frame is free to use */ static ALWAYS_INLINE int frame_is_free(const xavs2_handler_t *h_mgr, int cur_poc, xavs2_frame_t *frame) { if (frame_is_writable(h_mgr, frame)) { return 1; /* this frame will never be used */ } else { return (XAVS2_ABS(cur_poc - frame->i_frame) >= 128) /* is too long-ago frame ? */; } } /* --------------------------------------------------------------------------- * find a free frame for encoding */ static INLINE xavs2_frame_t *frame_buffer_find_free_frame_dpb(xavs2_handler_t *h_mgr, xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps) { xavs2_frame_t **DPB = frm_buf->frames; xavs2_frame_t *fdec_frm = NULL; int num_frames = frm_buf->num_frames; int i; // find a free frame for the fdec for (i = 0; i < num_frames; i++) { xavs2_frame_t *frame = DPB[i]; if (frame != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame_is_free(h_mgr, cur_frm->i_frame, frame)) { frame->cnt_refered++; // RDO frame->cnt_refered++; // reconstruction output frame->cnt_refered++; // Entropy encoding fdec_frm = frame; xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ break; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } // fdec must exist for (; fdec_frm == NULL;) { for (i = 0; i < num_frames; i++) { xavs2_frame_t *frame = DPB[i]; if (frame != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* unlock */ if (frame_is_writable(h_mgr, frame)) { p_rps->rm_pic[p_rps->num_to_rm++] = cur_frm->i_frm_coi - frame->i_frm_coi; p_rps->idx_in_gop = -1; frame->cnt_refered++; // RDO frame->cnt_refered++; // reconstruction output frame->cnt_refered++; // Entropy encoding fdec_frm = frame; xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ break; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } if (fdec_frm) { break; } xavs2_thread_cond_wait(&h_mgr->cond[SIG_FRM_BUFFER_RELEASED], &h_mgr->mutex); } if (fdec_frm) { memcpy(&fdec_frm->rps, p_rps, sizeof(xavs2_rps_t)); fdec_frm->i_frame = -1; fdec_frm->i_frm_coi = -1; fdec_frm->cnt_refered += fdec_frm->rps.referd_by_others; memset(fdec_frm->num_lcu_coded_in_row, 0, h->i_height_in_lcu * sizeof(fdec_frm->num_lcu_coded_in_row[0])); } return fdec_frm; } /* 
--------------------------------------------------------------------------- * find a free frame for encoding */ static INLINE void rps_determine_remove_frames(xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm) { int i, k; // remove the frames that will never be referenced for (i = 0, k = 0; i < cur_frm->rps.num_to_rm; i++) { int coi = cur_frm->i_frm_coi - cur_frm->rps.rm_pic[i]; xavs2_frame_t *frame; if (coi < 0) { continue; } frame = find_frame_by_coi(frm_buf, coi); if (frame != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame->i_frm_coi == coi && frame->cnt_refered > 0) { // can not remove frames with lower layers assert(cur_frm->rps.temporal_id <= frame->rps.temporal_id); cur_frm->rps.rm_pic[k++] = cur_frm->rps.rm_pic[i]; xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ continue; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } if (cur_frm->rps.num_to_rm != k) { cur_frm->rps.num_to_rm = k; cur_frm->rps.idx_in_gop = -1; } } /* --------------------------------------------------------------------------- * update frame buffer, record frames to be removed */ void frame_buffer_update_remove_frames(xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm) { int i; frm_buf->num_frames_to_remove = cur_frm->rps.num_to_rm; for (i = 0; i < frm_buf->num_frames_to_remove; i++) { frm_buf->coi_remove_frame[i] = cur_frm->i_frm_coi - cur_frm->rps.rm_pic[i]; } // xavs2_log(NULL, XAVS2_LOG_INFO, "RPS remove[%d]: %d, [%d %d]\n", // cur_frm->i_frm_coi, cur_frm->rps.num_to_rm, cur_frm->rps.rm_pic[0], cur_frm->rps.rm_pic[1]); } /* --------------------------------------------------------------------------- * update frame buffer, remove frames */ void frame_buffer_remove_frames(xavs2_frame_buffer_t *frm_buf) { int i; for (i = 0; i < frm_buf->num_frames_to_remove; i++) { int coi_frame_to_remove = frm_buf->coi_remove_frame[i]; xavs2_frame_t *frame = find_frame_by_coi(frm_buf, coi_frame_to_remove); if (frame != NULL) { xavs2_thread_mutex_lock(&frame->mutex); /* lock */ if (frame->i_frm_coi == coi_frame_to_remove && frame->cnt_refered > 0) { frame->cnt_refered--; // xavs2_log(NULL, XAVS2_LOG_DEBUG, "remove frame COI: %3d, POC %3d\n", // frame->i_frm_coi, frame->i_frame); xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ break; } xavs2_thread_mutex_unlock(&frame->mutex); /* unlock */ } } } /** * =========================================================================== * functions * =========================================================================== */ /* --------------------------------------------------------------------------- */ xavs2_frame_t *frame_buffer_get_free_frame_ipb(xavs2_handler_t *h_mgr) { xavs2_frame_t *frame = NULL; frame = (xavs2_frame_t *)xl_remove_head(&h_mgr->list_frames_free, 1); return frame; } /* --------------------------------------------------------------------------- * build rps of a frame */ int rps_build(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps, xavs2_frame_t *frefs[XAVS2_MAX_REFS]) { // initialize current RPS cur_frm->rps_index_in_gop = xavs2e_get_frame_rps(h, frm_buf, cur_frm, p_rps); // get encoding layer of current frame if (h->param->temporal_id_exist_flag == 1 && cur_frm->i_frm_type != XAVS2_TYPE_I) { if (p_rps->temporal_id < 0 || p_rps->temporal_id >= TEMPORAL_MAXLEVEL) { p_rps->temporal_id = TEMPORAL_MAXLEVEL - 1; // the lowest level } } else { p_rps->temporal_id = 0; } // prepare the reference list p_rps->num_of_ref = rps_init_reference_list(h, frm_buf, cur_frm, p_rps, frefs); 
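    /* What follows, roughly: a B frame needs exactly two references, otherwise
     * rps_build() gives up (returns -1); if the two references do not lie on
     * opposite sides of the current frame, rps_fix_reference_list_b() re-picks
     * the nearest past and nearest future reference from the DPB. For P/F
     * frames, rps_fix_reference_list_pf() pads the list with additional past
     * references, up to h->i_max_ref in total. Finally the removal list is
     * pruned to frames actually present (and still referenced) in the DPB. */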
if (cur_frm->i_frm_type == XAVS2_TYPE_B && p_rps->num_of_ref != 2) { return -1; } if (cur_frm->i_frm_type == XAVS2_TYPE_B && (frefs[0]->i_frame <= cur_frm->i_frame || frefs[1]->i_frame >= cur_frm->i_frame)) { // for B frames with wrong reference frames p_rps->num_of_ref = rps_fix_reference_list_b(h, frm_buf, cur_frm, p_rps, frefs); } else if (cur_frm->i_frm_type == XAVS2_TYPE_P || cur_frm->i_frm_type == XAVS2_TYPE_F) { // for P/F-frame p_rps->num_of_ref = rps_fix_reference_list_pf(h, frm_buf, cur_frm, p_rps, frefs); } rps_determine_remove_frames(frm_buf, cur_frm); return 0; } /* --------------------------------------------------------------------------- * initializes the parameters for a new frame */ xavs2_frame_t *find_fdec_and_build_rps(xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_frame_t *frefs[XAVS2_MAX_REFS]) { xavs2_handler_t *h_mgr = h->h_top; xavs2_frame_t *frame = NULL; /* remove frames before current frame encoding */ frame_buffer_remove_frames(frm_buf); if (rps_build(h, frm_buf, cur_frm, &cur_frm->rps, frefs) < 0) { return NULL; } /* find a frame for encoding */ frame = frame_buffer_find_free_frame_dpb(h_mgr, h, frm_buf, cur_frm, &cur_frm->rps); /* label frames to be removed */ frame_buffer_update_remove_frames(frm_buf, cur_frm); return frame; } /* --------------------------------------------------------------------------- * set picture reorder delay */ void rps_set_picture_reorder_delay(xavs2_t *h) { h->i_gop_size = h->param->i_gop_size; if (!h->param->low_delay) { int delta_dd = 1000; int tmp_delta_dd; int i; for (i = 0; i < h->i_gop_size; i++) { tmp_delta_dd = h->param->cfg_ref_all[i].poc - (i + 1); if (tmp_delta_dd < delta_dd) { delta_dd = tmp_delta_dd; } } // set picture reorder delay if (delta_dd < 0) { h->picture_reorder_delay = -delta_dd; } else { h->picture_reorder_delay = 0; } } } /* --------------------------------------------------------------------------- * check RPS config */ static int update_rps_config(xavs2_param_t *param) { xavs2_rps_t *p_seq_rps = param->cfg_ref_all; if (param->i_gop_size < 0) { param->i_gop_size = XAVS2_ABS(param->i_gop_size); /* set default configuration for reference_management */ memset(p_seq_rps, -1, XAVS2_MAX_GOPS * sizeof(xavs2_rps_t)); if (param->num_bframes == 0) { /* LDP */ default_reference_management_ldp(&p_seq_rps[0]); } else { /* RA */ if (param->i_gop_size == 4) { default_reference_management_ra_gop4(&p_seq_rps[0]); param->num_bframes = 3; } else if (param->i_gop_size == 8) { default_reference_management_ra_gop8(&p_seq_rps[0]); param->num_bframes = 7; } else { /* GOP size error */ return -1; } } } return 0; } /* --------------------------------------------------------------------------- * config RPS */ int rps_check_config(xavs2_param_t *param) { xavs2_rps_t *p_seq_rps = param->cfg_ref_all; int rps_idx; if (update_rps_config(param) < 0) { return -1; } // set index for (rps_idx = 0; rps_idx < param->i_gop_size; rps_idx++) { p_seq_rps[rps_idx].idx_in_gop = rps_idx; } if (param->num_max_ref < 4) { for (rps_idx = 0; rps_idx < param->i_gop_size; rps_idx++) { p_seq_rps[rps_idx].num_of_ref = XAVS2_MIN(param->num_max_ref, p_seq_rps[rps_idx].num_of_ref); } } return 0; } xavs2-1.3/source/encoder/rps.h000066400000000000000000000055271340660520300163240ustar00rootroot00000000000000/* * rps.h * * Description of this file: * RPS functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * 
Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_RPS_H #define XAVS2_RPS_H // (1 - LDP, 2 - RA, 3 - RAP, 4 - AI) enum xavs2e_rps_cfg_e { XAVS2_RPS_CFG_LDP = 1, XAVS2_RPS_CFG_RA = 2, XAVS2_RPS_CFG_RAP = 3, XAVS2_RPS_CFG_AI = 4 }; #define frame_buffer_get_free_frame_ipb FPFX(frame_buffer_get_free_frame_ipb) xavs2_frame_t *frame_buffer_get_free_frame_ipb(xavs2_handler_t *h_mgr); #define frame_buffer_update_remove_frames FPFX(frame_buffer_update_remove_frames) void frame_buffer_update_remove_frames(xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm); #define frame_buffer_remove_frames FPFX(frame_buffer_remove_frames) void frame_buffer_remove_frames(xavs2_frame_buffer_t *frm_buf); #define rps_build FPFX(rps_build) int rps_build(const xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_rps_t *p_rps, xavs2_frame_t *frefs[XAVS2_MAX_REFS]); #define find_fdec_and_build_rps FPFX(find_fdec_and_build_rps) xavs2_frame_t *find_fdec_and_build_rps(xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *cur_frm, xavs2_frame_t *frefs[XAVS2_MAX_REFS]); #define rps_check_config FPFX(rps_check_config) int rps_check_config(xavs2_param_t *param); #define rps_set_picture_reorder_delay FPFX(rps_set_picture_reorder_delay) void rps_set_picture_reorder_delay(xavs2_t *h); #endif // XAVS2_RPS_H xavs2-1.3/source/encoder/sao.c000066400000000000000000001365011340660520300162720ustar00rootroot00000000000000/* * sao.c * * Description of this file: * SAO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "aec.h" #include "sao.h" #include "filter.h" #include "cpu.h" #include "cudata.h" #include "vec/intrinsic.h" static const int tab_sao_check_mode_fast[3][5] = { 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 }; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void sao_init_stat_data(SAOStatData *p_stats) { memset(p_stats, 0, sizeof(SAOStatData)); } /* --------------------------------------------------------------------------- */ static void sao_get_stat_block_EO_0(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; int x, y; int leftsign, rightsign; /* the size of SAO region max be larger than MAX_CU_SIZE on right/down of picture */ int edgetype; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; const pel_t *p_org_iter; const pel_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; start_y = 0; end_y = height; start_x = p_region->b_left ? 0 : 1; end_x = p_region->b_right ? width : (width - 1); p_org_iter = p_org + start_y * i_org; p_rec_iter += start_y * i_rec; for (y = start_y; y < end_y; y++) { leftsign = xavs2_sign3(p_rec_iter[start_x] - p_rec_iter[start_x - 1]); for (x = start_x; x < end_x; x++) { rightsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1]); edgetype = leftsign + rightsign; leftsign = -rightsign; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } p_rec_iter += i_rec; p_org_iter += i_org; } } /* --------------------------------------------------------------------------- */ static void sao_get_stat_block_EO_90(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; int x, y; int upsign, downsign; /* the size of SAO region max be larger than MAX_CU_SIZE on right/down of picture */ int edgetype; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; const pel_t *p_org_iter; const pel_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; start_x = 0; end_x = width; start_y = p_region->b_top ? 0 : 1; end_y = p_region->b_down ? 
height : (height - 1); for (x = start_x; x < end_x; x++) { upsign = xavs2_sign3(p_rec_iter[start_y * i_rec + x] - p_rec_iter[(start_y - 1) * i_rec + x]); for (y = start_y; y < end_y; y++) { downsign = xavs2_sign3(p_rec_iter[y * i_rec + x] - p_rec_iter[(y + 1) * i_rec + x]); edgetype = downsign + upsign; upsign = -downsign; p_stats->diff[edgetype + 2] += (p_org_iter[y * i_org + x] - p_rec_iter[y * i_rec + x]); p_stats->count[edgetype + 2]++; } } } /* --------------------------------------------------------------------------- */ static void sao_get_stat_block_EO_135(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; int upsign, downsign; /* the size of SAO region max be larger than MAX_CU_SIZE on right/down of picture */ int signupline[MAX_CU_SIZE << 1]; int reg = 0; int edgetype; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; const pel_t *p_org_iter; const pel_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; start_x_r0 = p_region->b_top_left ? 0 : 1; end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1; start_x_r = p_region->b_left ? 0 : 1; end_x_r = p_region->b_right ? width : (width - 1); start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1); end_x_rn = p_region->b_right_down ? width : (width - 1); // init the line buffer for (x = start_x_r + 1; x < end_x_r + 1; x++) { upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x - 1]); signupline[x] = upsign; } // first row for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); edgetype = upsign - signupline[x + 1]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } // middle rows p_rec_iter += i_rec; p_org_iter += i_org; for (y = 1; y < height - 1; y++) { for (x = start_x_r; x < end_x_r; x++) { if (x == start_x_r) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); signupline[x] = upsign; } downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]); edgetype = downsign + signupline[x]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; signupline[x] = (char)reg; reg = -downsign; } p_rec_iter += i_rec; p_org_iter += i_org; } // last row for (x = start_x_rn; x < end_x_rn; x++) { if (x == start_x_r) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 - i_rec]); signupline[x] = upsign; } downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 + i_rec]); edgetype = downsign + signupline[x]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } } /* --------------------------------------------------------------------------- */ static void sao_get_stat_block_EO_45(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; int upsign, downsign; /* the size of SAO region max be larger than MAX_CU_SIZE on right/down of picture */ int signupline[MAX_CU_SIZE << 1]; int *signupline1; int 
edgetype; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; const pel_t *p_org_iter; const pel_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1); end_x_r0 = p_region->b_top_right ? width : (width - 1); start_x_r = p_region->b_left ? 0 : 1; end_x_r = p_region->b_right ? width : (width - 1); start_x_rn = p_region->b_down_left ? 0 : 1; end_x_rn = p_region->b_down ? (p_region->b_right ? width : (width - 1)) : 1; // init the line buffer signupline1 = signupline + 1; for (x = start_x_r - 1; x < XAVS2_MAX(end_x_r - 1, end_x_r0 - 1); x++) { upsign = xavs2_sign3(p_rec_iter[x + i_rec] - p_rec_iter[x + 1]); signupline1[x] = upsign; } // first row for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); edgetype = upsign - signupline1[x - 1]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } // middle rows p_rec_iter += i_rec; p_org_iter += i_org; for (y = 1; y < height - 1; y++) { for (x = start_x_r; x < end_x_r; x++) { if (x == end_x_r - 1) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); signupline1[x] = upsign; } downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]); edgetype = downsign + signupline1[x]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; signupline1[x - 1] = -downsign; } p_rec_iter += i_rec; p_org_iter += i_org; } for (x = start_x_rn; x < end_x_rn; x++) { if (x == end_x_r - 1) { upsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x + 1 - i_rec]); signupline1[x] = upsign; } downsign = xavs2_sign3(p_rec_iter[x] - p_rec_iter[x - 1 + i_rec]); edgetype = downsign + signupline1[x]; p_stats->diff[edgetype + 2] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[edgetype + 2]++; } } /* --------------------------------------------------------------------------- */ static void sao_get_stat_block_BO(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *p_stats, sao_region_t *p_region, int compIdx) { int start_x, end_x, start_y, end_y; int x, y; /* the size of SAO region max be larger than MAX_CU_SIZE on right/down of picture */ int bandtype; int band_shift; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_rec = frm_rec->i_stride[compIdx]; int i_org = frm_org->i_stride[compIdx]; const pel_t *p_rec = frm_rec->planes[compIdx] + pix_y * i_rec + pix_x; const pel_t *p_org = frm_org->planes[compIdx] + pix_y * i_org + pix_x; const pel_t *p_org_iter; const pel_t *p_rec_iter; sao_init_stat_data(p_stats); p_org_iter = p_org; p_rec_iter = p_rec; band_shift = (g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); start_x = 0; end_x = width; start_y = 0; end_y = height; for (y = start_y; y < end_y; y++) { for (x = start_x; x < end_x; x++) { bandtype = p_rec_iter[x] >> band_shift; p_stats->diff[bandtype] += (p_org_iter[x] - p_rec_iter[x]); p_stats->count[bandtype]++; } p_rec_iter += i_rec; p_org_iter += i_org; } } /* --------------------------------------------------------------------------- */ typedef 
void(*sao_pf)(xavs2_frame_t *frm_rec, xavs2_frame_t *frm_org, SAOStatData *stat_datas, sao_region_t *p_region, int compIdx); sao_pf gf_sao_stat[5] = { sao_get_stat_block_EO_0, sao_get_stat_block_EO_90, sao_get_stat_block_EO_135, sao_get_stat_block_EO_45, sao_get_stat_block_BO }; /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE long distortion_cal(long count, int offset, long diff) { return (count * offset * offset - diff * offset * 2); } /* --------------------------------------------------------------------------- */ static int offset_estimation(int typeIdx, int classIdx, rdcost_t lambda, long offset_ori, int count, long diff, rdcost_t *bestCost) { const int tab_EO_OFFSET_MAP[8] = {4, 2, 1, 3, 5, 6, 7, 7}; // -1, 0, ..., 6 int cur_offset = offset_ori; int offset_best = 0; int lower_bd, upper_bd, Th; int temp_offset, start_offset, end_offset; int temprate; long tempdist; rdcost_t tempcost, mincost; const int *eo_offset_bins = &(tab_EO_OFFSET_MAP[1]); int offset_type; if (typeIdx == SAO_TYPE_BO) { offset_type = SAO_CLASS_BO; } else { offset_type = classIdx; } lower_bd = tab_saoclip[offset_type][0]; upper_bd = tab_saoclip[offset_type][1]; Th = tab_saoclip[offset_type][2]; cur_offset = XAVS2_CLIP3(lower_bd, upper_bd, cur_offset); if (typeIdx == SAO_TYPE_BO) { start_offset = XAVS2_MIN(cur_offset, 0); end_offset = XAVS2_MAX(cur_offset, 0); } else { assert(typeIdx >= SAO_TYPE_EO_0 && typeIdx <= SAO_TYPE_EO_45); switch (classIdx) { case SAO_CLASS_EO_FULL_VALLEY: start_offset = -1; end_offset = XAVS2_MAX(cur_offset, 1); break; case SAO_CLASS_EO_HALF_VALLEY: start_offset = 0; end_offset = 1; break; case SAO_CLASS_EO_HALF_PEAK: start_offset = -1; end_offset = 0; break; case SAO_CLASS_EO_FULL_PEAK: start_offset = XAVS2_MIN(cur_offset, -1); end_offset = 1; break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a supported SAO mode offset_estimation\n"); exit(-1); } } mincost = MAX_COST; for (temp_offset = start_offset; temp_offset <= end_offset; temp_offset++) { if (typeIdx == SAO_TYPE_BO) { assert(offset_type == SAO_CLASS_BO); temprate = XAVS2_ABS(temp_offset); temprate = temprate ? (temprate + 1) : 0; } else if (classIdx == SAO_CLASS_EO_HALF_VALLEY || classIdx == SAO_CLASS_EO_HALF_PEAK) { temprate = XAVS2_ABS(temp_offset); } else { temprate = eo_offset_bins[classIdx == SAO_CLASS_EO_FULL_VALLEY ? temp_offset : -temp_offset]; } temprate = (temprate == Th) ? temprate : (temprate + 1); tempdist = distortion_cal(count, temp_offset, diff); tempcost = tempdist + lambda * temprate; if (tempcost < mincost) { mincost = tempcost; offset_best = temp_offset; *bestCost = tempcost; } } return offset_best; } /* --------------------------------------------------------------------------- */ static void find_offset(int typeIdc, SAOStatData *p_stat, SAOBlkParam *p_param, rdcost_t lambda) { int class_i; rdcost_t classcost[MAX_NUM_SAO_CLASSES]; rdcost_t offth; rdcost_t mincost_bandsum, cost_bandsum; int num_class = (typeIdc == SAO_TYPE_BO) ? NUM_SAO_BO_CLASSES : NUM_SAO_EO_CLASSES; static const int deltaband_cost[] = { -1, -1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; int start_band1, start_band2, delta_band12; for (class_i = 0; class_i < num_class; class_i++) { if ((typeIdc != SAO_TYPE_BO) && (class_i == SAO_CLASS_EO_PLAIN)) { p_param->offset[class_i] = 0; continue; } if (p_stat[typeIdc].count[class_i] == 0) { p_param->offset[class_i] = 0; //offset will be zero continue; } offth = p_stat[typeIdc].diff[class_i] > 0 ? 0.5 : (p_stat[typeIdc].diff[class_i] < 0 ? 
-0.5 : 0); p_param->offset[class_i] = (int8_t)((double)p_stat[typeIdc].diff[class_i] / (double)p_stat[typeIdc].count[class_i] + offth); } if (typeIdc == SAO_TYPE_BO) { int best_start_band1 = 0; int best_start_band2 = 0; for (class_i = 0; class_i < num_class; class_i++) { p_param->offset[class_i] = (int8_t)offset_estimation(typeIdc, class_i, lambda, p_param->offset[class_i], p_stat[typeIdc].count[class_i], p_stat[typeIdc].diff[class_i], &(classcost[class_i])); } mincost_bandsum = MAX_DOUBLE; for (start_band1 = 0; start_band1 < (NUM_SAO_BO_CLASSES - 1); start_band1++) { for (start_band2 = start_band1 + 2; start_band2 < (NUM_SAO_BO_CLASSES - 1); start_band2++) { cost_bandsum = classcost[start_band1] + classcost[start_band1 + 1] + classcost[start_band2] + classcost[start_band2 + 1]; delta_band12 = (start_band2 - start_band1) >(NUM_SAO_BO_CLASSES >> 1) ? (32 - start_band2 + start_band1) : (start_band2 - start_band1); assert(delta_band12 >= 0 && delta_band12 <= (NUM_SAO_BO_CLASSES >> 1)); cost_bandsum += lambda * deltaband_cost[delta_band12]; if (cost_bandsum < mincost_bandsum) { mincost_bandsum = cost_bandsum; best_start_band1 = start_band1; best_start_band2 = start_band2; } } } for (class_i = 0; class_i < num_class; class_i++) { if ((class_i >= best_start_band1 && class_i <= best_start_band1 + 1) || (class_i >= best_start_band2 && class_i <= best_start_band2 + 1)) { continue; } p_param->offset[class_i] = 0; } start_band1 = XAVS2_MIN(best_start_band1, best_start_band2); start_band2 = XAVS2_MAX(best_start_band1, best_start_band2); delta_band12 = (start_band2 - start_band1); if (delta_band12 > (NUM_SAO_BO_CLASSES >> 1)) { p_param->deltaBand = 32 - delta_band12; // TODO: Ӧ (32 + delta_band12) p_param->startBand = start_band2; } else { p_param->deltaBand = delta_band12; p_param->startBand = start_band1; } } else { assert(typeIdc >= SAO_TYPE_EO_0 && typeIdc <= SAO_TYPE_EO_45); for (class_i = 0; class_i < num_class; class_i++) { if (class_i == SAO_CLASS_EO_PLAIN) { p_param->offset[class_i] = 0; classcost[class_i] = 0; } else { p_param->offset[class_i] = (int8_t)offset_estimation(typeIdc, class_i, lambda, p_param->offset[class_i], p_stat[typeIdc].count[class_i], p_stat[typeIdc].diff[class_i], &(classcost[class_i])); } } p_param->startBand = 0; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE long get_distortion(int compIdx, int type, SAOStatData stat_data[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES], SAOBlkParam *sao_cur_param) { int classIdc, bandIdx; long dist = 0; switch (type) { case SAO_TYPE_EO_0: case SAO_TYPE_EO_90: case SAO_TYPE_EO_135: case SAO_TYPE_EO_45: for (classIdc = 0; classIdc < NUM_SAO_EO_CLASSES; classIdc++) { dist += distortion_cal(stat_data[compIdx][type].count[classIdc], sao_cur_param[compIdx].offset[classIdc], stat_data[compIdx][type].diff[classIdc]); } break; case SAO_TYPE_BO: for (classIdc = 0; classIdc < NUM_BO_OFFSET; classIdc++) { bandIdx = classIdc % NUM_SAO_BO_CLASSES; dist += distortion_cal(stat_data[compIdx][type].count[bandIdx], sao_cur_param[compIdx].offset[bandIdx], stat_data[compIdx][type].diff[bandIdx]); } break; default: xavs2_log(NULL, XAVS2_LOG_ERROR, "Not a supported type in get_distortion()"); exit(-1); } return dist; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void copy_sao_param_lcu(SAOBlkParam *saopara_dst, SAOBlkParam *saopara_src) { memcpy(saopara_dst, saopara_src, NUM_SAO_COMPONENTS * sizeof(SAOBlkParam)); } /* 
--------------------------------------------------------------------------- */ static ALWAYS_INLINE void copy_sao_param_one_comp(SAOBlkParam *saopara_dst, SAOBlkParam *saopara_src) { memcpy(saopara_dst, saopara_src, sizeof(SAOBlkParam)); } /* --------------------------------------------------------------------------- */ static rdcost_t sao_rdo_new_params(xavs2_t *h, aec_t *p_aec, int avail_left, int avail_up, bool_t *slice_sao_on, rdcost_t sao_lambda, SAOStatData stat_data[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES], SAOBlkParam *sao_cur_param) { ALIGN16(SAOBlkParam temp_sao_param[NUM_SAO_COMPONENTS]); rdcost_t total_rdcost = 0; int bits; int compIdx, type; sao_cur_param[SAO_Y].mergeIdx = SAO_MERGE_NONE; // SET AS NOT MERGE MODE if (avail_left + avail_up) { bits = p_aec->binary.write_sao_mergeflag(p_aec, avail_left, avail_up, &(sao_cur_param[SAO_Y])); total_rdcost += bits * sao_lambda; } for (compIdx = 0; compIdx < 3; compIdx++){ if (slice_sao_on[compIdx]) { rdcost_t mincost; rdcost_t curcost; aec_copy_coding_state_sao(&h->cs_data.cs_sao_start, p_aec); // for off mode sao_cur_param[compIdx].mergeIdx = SAO_MERGE_NONE; sao_cur_param[compIdx].typeIdc = SAO_TYPE_OFF; bits = p_aec->binary.write_sao_mode(p_aec, &(sao_cur_param[compIdx])); mincost = sao_lambda * bits; aec_copy_coding_state_sao(&h->cs_data.cs_sao_temp, p_aec); // for other normal mode for (type = 0; type < 5; type++) { if (!h->param->b_fast_sao || tab_sao_check_mode_fast[compIdx][type]) { if (((!IS_ALG_ENABLE(OPT_FAST_SAO)) || (!(!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)))) { aec_copy_coding_state_sao(p_aec, &h->cs_data.cs_sao_start); temp_sao_param[compIdx].mergeIdx = SAO_MERGE_NONE; temp_sao_param[compIdx].typeIdc = type; find_offset(type, stat_data[compIdx], &temp_sao_param[compIdx], sao_lambda); curcost = get_distortion(compIdx, type, stat_data, temp_sao_param); bits = p_aec->binary.write_sao_mode(p_aec, &(temp_sao_param[compIdx])); bits += p_aec->binary.write_sao_offset(p_aec, &(temp_sao_param[compIdx])); bits += p_aec->binary.write_sao_type(p_aec, &(temp_sao_param[compIdx])); curcost += sao_lambda * bits; if (curcost < mincost) { mincost = curcost; copy_sao_param_one_comp(&sao_cur_param[compIdx], &temp_sao_param[compIdx]); aec_copy_coding_state_sao(&h->cs_data.cs_sao_temp, p_aec); } } } } aec_copy_coding_state_sao(p_aec, &h->cs_data.cs_sao_temp); total_rdcost += mincost; } } return total_rdcost; } /* --------------------------------------------------------------------------- */ static void getMergeNeighbor(xavs2_t *h, int lcu_x, int lcu_y, SAOBlkParam (*blk_param)[NUM_SAO_COMPONENTS], int *MergeAvail, SAOBlkParam sao_merge_param[][NUM_SAO_COMPONENTS]) { int mb_y = lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int mb_x = lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int mergeup_avail, mergeleft_avail; int width_in_smb = h->i_width_in_lcu; mergeup_avail = (mb_y == 0) ? 0 : (cu_get_slice_index(h, mb_x, mb_y) == cu_get_slice_index(h, mb_x, mb_y - 1)); mergeleft_avail = (mb_x == 0) ? 
0 : (cu_get_slice_index(h, mb_x, mb_y) == cu_get_slice_index(h, mb_x - 1, mb_y)); if (blk_param != NULL) { if (mergeleft_avail) { copy_sao_param_lcu(sao_merge_param[SAO_MERGE_LEFT], blk_param[-1]); } if (mergeup_avail) { copy_sao_param_lcu(sao_merge_param[SAO_MERGE_ABOVE], blk_param[-width_in_smb]); } } MergeAvail[SAO_MERGE_LEFT] = mergeleft_avail; MergeAvail[SAO_MERGE_ABOVE] = mergeup_avail; } /* --------------------------------------------------------------------------- */ static rdcost_t sao_rdcost_merge(xavs2_t *h, aec_t *p_aec, rdcost_t sao_labmda, SAOStatData stat_data[NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES], SAOBlkParam *sao_cur_param, int merge_avail[NUM_SAO_MERGE_TYPES], int mergeIdx, SAOBlkParam merge_candidate[NUM_SAO_MERGE_TYPES][NUM_SAO_COMPONENTS]) { int compIdx; int type; int currate; rdcost_t curcost = 0; assert(merge_avail[mergeIdx]); copy_sao_param_lcu(sao_cur_param, merge_candidate[mergeIdx]); for (compIdx = 0; compIdx < NUM_SAO_COMPONENTS; compIdx++) { type = merge_candidate[mergeIdx][compIdx].typeIdc; sao_cur_param[compIdx].mergeIdx = SAO_MERGE_LEFT + mergeIdx; if (type != SAO_TYPE_OFF && h->slice_sao_on[compIdx] != 0) { curcost += get_distortion(compIdx, type, stat_data, sao_cur_param); } } currate = p_aec->binary.write_sao_mergeflag(p_aec, merge_avail[SAO_MERGE_LEFT], merge_avail[SAO_MERGE_ABOVE], sao_cur_param); curcost += sao_labmda * currate; return curcost; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void off_sao(SAOBlkParam *saoblkparam) { int i; for (i = 0; i < NUM_SAO_COMPONENTS; i++) { saoblkparam[i].mergeIdx = SAO_MERGE_NONE; saoblkparam[i].typeIdc = SAO_TYPE_OFF; saoblkparam[i].startBand = -1; saoblkparam[i].deltaBand = -1; memset(saoblkparam[i].offset, 0, sizeof(saoblkparam[0].offset)); } } /* --------------------------------------------------------------------------- */ static void sao_filter_region(xavs2_t *h, SAOBlkParam *blk_param, int compIdx, sao_region_t *p_region) { ALIGN16(int signupline[MAX_CU_SIZE + SAO_SHIFT_PIX_NUM]); const int max_val = ((1 << h->param->sample_bit_depth) - 1); int start_x, end_x, start_y, end_y; int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; int leftsign, rightsign, upsign, downsign; int reg = 0; int edgetype, bandtype; int band_shift; int pix_x = p_region->pix_x[compIdx]; int pix_y = p_region->pix_y[compIdx]; int width = p_region->width[compIdx]; int height = p_region->height[compIdx]; int i_src = h->img_sao->i_stride[compIdx]; int i_dst = h->fdec->i_stride[compIdx]; pel_t *dst = h->fdec->planes[compIdx] + pix_y * i_dst + pix_x; pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x; assert(blk_param->typeIdc != SAO_TYPE_OFF); switch (blk_param->typeIdc) { case SAO_TYPE_EO_0: end_y = height; start_x = p_region->b_left ? 0 : 1; end_x = p_region->b_right ? width : (width - 1); for (y = 0; y < end_y; y++) { leftsign = xavs2_sign3(src[start_x] - src[start_x - 1]); for (x = start_x; x < end_x; x++) { rightsign = xavs2_sign3(src[x] - src[x + 1]); edgetype = leftsign + rightsign; leftsign = -rightsign; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } src += i_src; dst += i_dst; } break; case SAO_TYPE_EO_90: { pel_t *src_base = src; pel_t *dst_base = dst; start_x = 0; end_x = width; start_y = p_region->b_top ? 0 : 1; end_y = p_region->b_down ? 
height : (height - 1); src_base += start_y * i_src; dst_base += start_y * i_dst; for (x = start_x; x < end_x; x++) { src = src_base; dst = dst_base; upsign = xavs2_sign3(src[0] - src[-i_src]); for (y = start_y; y < end_y; y++) { downsign = xavs2_sign3(src[0] - src[i_src]); edgetype = downsign + upsign; upsign = -downsign; *dst = (pel_t)XAVS2_CLIP3(0, max_val, src[0] + blk_param->offset[edgetype + 2]); src += i_src; dst += i_dst; } src_base++; dst_base++; } break; } case SAO_TYPE_EO_135: { start_x_r0 = p_region->b_top_left ? 0 : 1; end_x_r0 = p_region->b_top ? (p_region->b_right ? width : (width - 1)) : 1; start_x_r = p_region->b_left ? 0 : 1; end_x_r = p_region->b_right ? width : (width - 1); start_x_rn = p_region->b_down ? (p_region->b_left ? 0 : 1) : (width - 1); end_x_rn = p_region->b_right_down ? width : (width - 1); // init the line buffer for (x = start_x_r + 1; x < end_x_r + 1; x++) { signupline[x] = xavs2_sign3(src[x + i_src] - src[x - 1]); } // first row for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(src[x] - src[x - 1 - i_src]); edgetype = upsign - signupline[x + 1]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } // middle rows src += i_src; dst += i_dst; for (y = 1; y < height - 1; y++) { x = start_x_r; signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]); for (; x < end_x_r; x++) { downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]); edgetype = downsign + signupline[x]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); signupline[x] = reg; reg = -downsign; } dst += i_dst; src += i_src; } // last row x = start_x_rn; signupline[x] = xavs2_sign3(src[x] - src[x - 1 - i_src]); for (; x < end_x_rn; x++) { downsign = xavs2_sign3(src[x] - src[x + 1 + i_src]); edgetype = downsign + signupline[x]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } } break; case SAO_TYPE_EO_45: { start_x_r0 = p_region->b_top ? (p_region->b_left ? 0 : 1) : (width - 1); end_x_r0 = p_region->b_top_right ? width : (width - 1); start_x_r = p_region->b_left ? 0 : 1; end_x_r = p_region->b_right ? width : (width - 1); start_x_rn = p_region->b_down_left ? 0 : 1; end_x_rn = p_region->b_down ? (p_region->b_right ? 
width : (width - 1)) : 1; // init the line buffer for (x = start_x_r; x < end_x_r; x++) { signupline[x] = xavs2_sign3(src[x - 1 + i_src] - src[x]); } // first row for (x = start_x_r0; x < end_x_r0; x++) { upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]); edgetype = upsign - signupline[x]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } // middle rows src += i_src; dst += i_dst; for (y = 1; y < height - 1; y++) { signupline[end_x_r] = xavs2_sign3(src[end_x_r - 1] - src[end_x_r - i_src]); for (x = start_x_r; x < end_x_r; x++) { downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); edgetype = downsign + signupline[x + 1]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); signupline[x] = -downsign; } src += i_src; dst += i_dst; } //last row for (x = start_x_rn; x < end_x_rn; x++) { if (x == end_x_r - 1) { upsign = xavs2_sign3(src[x] - src[x + 1 - i_src]); signupline[x + 1] = upsign; } downsign = xavs2_sign3(src[x] - src[x - 1 + i_src]); edgetype = downsign + signupline[x + 1]; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[edgetype + 2]); } break; } case SAO_TYPE_BO: band_shift = (h->param->sample_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT); start_x = 0; end_x = width; start_y = 0; end_y = height; src += start_y * i_src; dst += start_y * i_dst; for (y = start_y; y < end_y; y++) { for (x = start_x; x < end_x; x++) { bandtype = src[x] >> band_shift; dst[x] = (pel_t)XAVS2_CLIP3(0, max_val, src[x] + blk_param->offset[bandtype]); } src += i_src; dst += i_dst; } break; default: xavs2_log(h, XAVS2_LOG_ERROR, "Not a supported SAO types for SAO_on_Block\n"); exit(-1); } } /* --------------------------------------------------------------------------- */ static void sao_get_neighbor_avail(xavs2_t *h, sao_region_t *p_avail, int i_lcu_x, int i_lcu_y) { int i_lcu_level = h->i_lcu_level; int pix_x = i_lcu_x << i_lcu_level; int pix_y = i_lcu_y << i_lcu_level; int width = XAVS2_MIN(1 << i_lcu_level, h->i_width - pix_x); int height = XAVS2_MIN(1 << i_lcu_level, h->i_height - pix_y); int pix_x_c = pix_x >> 1; int pix_y_c = pix_y >> CHROMA_V_SHIFT; int width_c = width >> 1; int height_c = height >> 1; /* Իȡ */ p_avail->b_left = i_lcu_x != 0; p_avail->b_top = i_lcu_y != 0; p_avail->b_right = (i_lcu_x < h->i_width_in_lcu - 1); p_avail->b_down = (i_lcu_y < h->i_height_in_lcu - 1); if (h->param->b_cross_slice_loop_filter == FALSE) { slice_t *slice = h->slices[h->i_slice_index]; if (p_avail->b_top) { p_avail->b_top = (slice->i_first_lcu_y != i_lcu_y); } if (p_avail->b_down) { p_avail->b_down = (slice->i_last_lcu_y != i_lcu_y); } } p_avail->b_top_left = p_avail->b_top && p_avail->b_left; p_avail->b_top_right = p_avail->b_top && p_avail->b_right; p_avail->b_down_left = p_avail->b_down && p_avail->b_left; p_avail->b_right_down = p_avail->b_down && p_avail->b_right; /* ˲ĵ */ if (!p_avail->b_right) { width += SAO_SHIFT_PIX_NUM; width_c += SAO_SHIFT_PIX_NUM; } if (!p_avail->b_down) { height += SAO_SHIFT_PIX_NUM; height_c += SAO_SHIFT_PIX_NUM; } if (p_avail->b_left) { pix_x -= SAO_SHIFT_PIX_NUM; pix_x_c -= SAO_SHIFT_PIX_NUM; } else { width -= SAO_SHIFT_PIX_NUM; width_c -= SAO_SHIFT_PIX_NUM; } if (p_avail->b_top) { pix_y -= SAO_SHIFT_PIX_NUM; pix_y_c -= SAO_SHIFT_PIX_NUM; } else { height -= SAO_SHIFT_PIX_NUM; height_c -= SAO_SHIFT_PIX_NUM; } /* make sure the width and height is not outside a picture */ width = XAVS2_MIN(width , h->i_width - pix_x); width_c = XAVS2_MIN(width_c, (h->i_width >> 1) - pix_x_c); height = XAVS2_MIN(height , 
h->i_height - pix_y); height_c = XAVS2_MIN(height_c, (h->i_height >> 1) - pix_y_c); /* luma component */ p_avail->pix_x[0] = pix_x; p_avail->pix_y[0] = pix_y; p_avail->width[0] = width; p_avail->height[0] = height; /* chroma components */ p_avail->pix_x[1] = p_avail->pix_x[2] = pix_x_c; p_avail->pix_y[1] = p_avail->pix_y[2] = pix_y_c; p_avail->width[1] = p_avail->width[2] = width_c; p_avail->height[1] = p_avail->height[2] = height_c; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void sao_get_param_lcu(xavs2_t *h, aec_t *p_aec, int lcu_x, int lcu_y, bool_t *slice_sao_on, SAOStatData stat_data [NUM_SAO_COMPONENTS][NUM_SAO_NEW_TYPES], SAOBlkParam (*blk_param)[NUM_SAO_COMPONENTS], rdcost_t sao_labmda) { if (slice_sao_on[0] || slice_sao_on[1] || slice_sao_on[2]) { SAOBlkParam sao_cur_param[NUM_SAO_COMPONENTS]; SAOBlkParam merge_candidate[NUM_SAO_MERGE_TYPES][NUM_SAO_COMPONENTS]; int merge_avail[NUM_SAO_MERGE_TYPES]; rdcost_t mcost; rdcost_t mincost = MAX_COST; getMergeNeighbor(h, lcu_x, lcu_y, blk_param, merge_avail, merge_candidate); // backup AEC contexts aec_copy_coding_state_sao(&h->cs_data.cs_sao_start, p_aec); // MERGE MODE if (merge_avail[SAO_MERGE_LEFT]) { mincost = sao_rdcost_merge(h, p_aec, sao_labmda, stat_data, sao_cur_param, merge_avail, SAO_MERGE_LEFT, merge_candidate); copy_sao_param_lcu(blk_param[0], sao_cur_param); aec_copy_coding_state_sao(&h->cs_data.cs_sao_best, p_aec); aec_copy_coding_state_sao(p_aec, &h->cs_data.cs_sao_start); } if (merge_avail[SAO_MERGE_ABOVE]) { mcost = sao_rdcost_merge(h, p_aec, sao_labmda, stat_data, sao_cur_param, merge_avail, SAO_MERGE_ABOVE, merge_candidate); if (mcost < mincost) { mincost = mcost; copy_sao_param_lcu(blk_param[0], sao_cur_param); aec_copy_coding_state_sao(&h->cs_data.cs_sao_best, p_aec); } aec_copy_coding_state_sao(p_aec, &h->cs_data.cs_sao_start); } // NEW MODE mcost = sao_rdo_new_params(h, p_aec, merge_avail[SAO_MERGE_LEFT], merge_avail[SAO_MERGE_ABOVE], slice_sao_on, sao_labmda, stat_data, sao_cur_param); if (mcost < mincost) { mincost = mcost; copy_sao_param_lcu(blk_param[0], sao_cur_param); aec_copy_coding_state_sao(&h->cs_data.cs_sao_best, &h->cs_data.cs_sao_temp); } // RESET ENTROPY CODING aec_copy_coding_state_sao(p_aec, &h->cs_data.cs_sao_best); } else { off_sao(blk_param[0]); } } /* --------------------------------------------------------------------------- */ void write_saoparam_one_lcu(xavs2_t *h, aec_t *p_aec, int lcu_x, int lcu_y, bool_t *slice_sao_on, SAOBlkParam sao_cur_param[NUM_SAO_COMPONENTS]) { if (slice_sao_on[0] || slice_sao_on[1] || slice_sao_on[2]) { int merge_avail[NUM_SAO_MERGE_TYPES]; int avail_left, avail_up; getMergeNeighbor(h, lcu_x, lcu_y, NULL, merge_avail, NULL); avail_left = merge_avail[0]; avail_up = merge_avail[1]; if (avail_left || avail_up) { p_aec->binary.write_sao_mergeflag(p_aec, avail_left, avail_up, &sao_cur_param[SAO_Y]); } if (sao_cur_param[SAO_Y].mergeIdx == SAO_MERGE_NONE) { int compIdx; for (compIdx = SAO_Y; compIdx < NUM_SAO_COMPONENTS; compIdx++) { if (slice_sao_on[compIdx]) { p_aec->binary.write_sao_mode(p_aec, &sao_cur_param[compIdx]); if (sao_cur_param[compIdx].typeIdc != SAO_TYPE_OFF) { p_aec->binary.write_sao_offset(p_aec, &sao_cur_param[compIdx]); p_aec->binary.write_sao_type(p_aec, &sao_cur_param[compIdx]); } } } } } } /* 
--------------------------------------------------------------------------- */ void sao_slice_onoff_decision(xavs2_t *h, bool_t *slice_sao_on) { const double saorate[NUM_SAO_COMPONENTS] = {SAO_RATE_THR, SAO_RATE_CHROMA_THR, SAO_RATE_CHROMA_THR}; const int num_lcu = h->i_width_in_lcu * h->i_height_in_lcu; int compIdx; for (compIdx = 0; compIdx < NUM_SAO_COMPONENTS; compIdx++) { if (h->param->chroma_format == CHROMA_420 || compIdx == IMG_Y) { slice_sao_on[compIdx] = TRUE; if (h->fref[0] != NULL && h->fref[0]->num_lcu_sao_off[compIdx] > num_lcu * saorate[compIdx]) { slice_sao_on[compIdx] = FALSE; } } else { slice_sao_on[compIdx] = FALSE; } } } /* --------------------------------------------------------------------------- */ static void sao_copy_lcu(xavs2_t *h, xavs2_frame_t *frm_dst, xavs2_frame_t *frm_src, int lcu_x, int lcu_y) { int i_src = frm_src->i_stride[0]; int i_dst = frm_dst->i_stride[0]; int start_y = lcu_y << h->i_lcu_level; int start_x = lcu_x << h->i_lcu_level; int end_y = XAVS2_MIN(h->i_height, ((lcu_y + 1) << h->i_lcu_level)); int end_x = XAVS2_MIN(h->i_width, ((lcu_x + 1) << h->i_lcu_level)); int lcu_width = end_x - start_x; int lcu_height; int i_first_lcu_y_for_filter = h->param->b_cross_slice_loop_filter ? 0 : h->slices[h->i_slice_index]->i_first_lcu_y; int start_y_shift = (lcu_y != i_first_lcu_y_for_filter) ? SAO_SHIFT_PIX_NUM : 0; pel_t *p_src; pel_t *p_dst; pel_t *p_src2, *p_dst2; /* luma component */ start_y -= start_y_shift; lcu_height = end_y - start_y; p_src = frm_src->planes[0] + start_y * i_src + start_x; p_dst = frm_dst->planes[0] + start_y * i_dst + start_x; g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); /* chroma component */ start_y = lcu_y << (h->i_lcu_level - CHROMA_V_SHIFT); start_y -= start_y_shift; end_y >>= CHROMA_V_SHIFT; start_x >>= CHROMA_V_SHIFT; end_x >>= CHROMA_V_SHIFT; lcu_width = end_x - start_x; lcu_height = end_y - start_y; i_src = frm_src->i_stride[1]; i_dst = frm_dst->i_stride[1]; p_src = frm_src->planes[1] + start_y * i_src + start_x; p_src2 = frm_src->planes[2] + start_y * i_src + start_x; p_dst = frm_dst->planes[1] + start_y * i_dst + start_x; p_dst2 = frm_dst->planes[2] + start_y * i_dst + start_x; g_funcs.plane_copy(p_dst, i_dst, p_src, i_src, lcu_width, lcu_height); g_funcs.plane_copy(p_dst2, i_dst, p_src2, i_src, lcu_width, lcu_height); } /* --------------------------------------------------------------------------- */ void sao_get_lcu_param_after_deblock(xavs2_t *h, aec_t *p_aec, int i_lcu_x, int i_lcu_y) { sao_region_t region; int i_lcu_xy = i_lcu_y * h->i_width_in_lcu + i_lcu_x; int compIdx, type; sao_copy_lcu(h, h->img_sao, h->fdec, i_lcu_x, i_lcu_y); sao_get_neighbor_avail(h, ®ion, i_lcu_x, i_lcu_y); for (compIdx = 0; compIdx < 3; compIdx++) { if (h->slice_sao_on[compIdx]) { for (type = 0; type < 5; type++) { if (!h->param->b_fast_sao || tab_sao_check_mode_fast[compIdx][type]) { if (((!IS_ALG_ENABLE(OPT_FAST_SAO)) || (!(!h->fdec->rps.referd_by_others && h->i_type == SLICE_TYPE_B)))) { gf_sao_stat[type](h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], ®ion, compIdx); } // SAOStatData tmp; // memset(&tmp, 0, sizeof(tmp)); // gf_sao_stat[type](h->fdec, h->fenc, &tmp, ®ion, compIdx); // if (memcmp(&tmp, &h->sao_stat_datas[i_lcu_xy][compIdx][type], sizeof(tmp)) != 0) { // xavs2_log(h, XAVS2_LOG_ERROR, "SAO mismatch!\n"); // gf_sao_stat[type](h->img_sao, h->fenc, &h->sao_stat_datas[i_lcu_xy][compIdx][type], ®ion, compIdx); // gf_sao_stat[type](h->fdec, h->fenc, &tmp, ®ion, compIdx); // } } } } } 
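    /* Statistics for every candidate SAO type and component of this LCU are
     * now available; pick the final parameters by RD cost. sao_get_param_lcu()
     * (above) tries the merge-left and merge-above candidates first, then the
     * new-parameter modes, using h->f_lambda_mode as the SAO lambda. */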
sao_get_param_lcu(h, p_aec, i_lcu_x, i_lcu_y, h->slice_sao_on, h->sao_stat_datas[i_lcu_xy], &h->sao_blk_params[i_lcu_xy], h->f_lambda_mode); } /* --------------------------------------------------------------------------- */ void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int lcu_x, int lcu_y) { sao_region_t region; SAOBlkParam *p_param = blk_param; int compIdx; sao_get_neighbor_avail(h, ®ion, lcu_x, lcu_y); for (compIdx = 0; compIdx < NUM_SAO_COMPONENTS; compIdx++) { if (h->slice_sao_on[compIdx] == 0 || p_param[compIdx].typeIdc == SAO_TYPE_OFF) { continue; } int pix_y = region.pix_y[compIdx]; int pix_x = region.pix_x[compIdx]; int i_dst = h->fdec->i_stride[compIdx]; int i_src = h->img_sao->i_stride[compIdx]; pel_t *dst = h->fdec->planes[compIdx] + pix_y * i_dst + pix_x; pel_t *src = h->img_sao->planes[compIdx] + pix_y * i_src + pix_x; int avail[8]; avail[0] = region.b_top; avail[1] = region.b_down; avail[2] = region.b_left; avail[3] = region.b_right; avail[4] = region.b_top_left; avail[5] = region.b_top_right; avail[6] = region.b_down_left; avail[7] = region.b_right_down; g_funcs.sao_block(dst, i_dst, src, i_src, region.width[compIdx], region.height[compIdx], avail, &p_param[compIdx]); } } xavs2-1.3/source/encoder/sao.h000066400000000000000000000043551340660520300163000ustar00rootroot00000000000000/* * sao.h * * Description of this file: * SAO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_SAO_H #define XAVS2_SAO_H #define write_saoparam_one_lcu FPFX(write_saoparam_one_lcu) void write_saoparam_one_lcu(xavs2_t *h, aec_t *p_aec, int lcu_x, int lcu_y, bool_t *slice_sao_on, SAOBlkParam *saoBlkParam); #define sao_slice_onoff_decision FPFX(sao_slice_onoff_decision) void sao_slice_onoff_decision(xavs2_t *h, bool_t *slice_sao_on); /* decide sao parameters directly after one lcu reconstruction */ #define sao_get_lcu_param_after_deblock FPFX(sao_get_lcu_param_after_deblock) void sao_get_lcu_param_after_deblock(xavs2_t *h, aec_t *p_aec, int i_lcu_x, int i_lcu_y); /* conduct SAO filtering after one lcu row coding */ #define sao_filter_lcu FPFX(sao_filter_lcu) void sao_filter_lcu(xavs2_t *h, SAOBlkParam blk_param[NUM_SAO_COMPONENTS], int lcu_x, int lcu_y); #endif // XAVS2_SAO_H xavs2-1.3/source/encoder/slice.c000066400000000000000000000551111340660520300166040ustar00rootroot00000000000000/* * slice.c * * Description of this file: * Slice Processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "mc.h" #include "aec.h" #include "nal.h" #include "wrapper.h" #include "slice.h" #include "header.h" #include "bitstream.h" #include "cudata.h" #include "rdo.h" #include "tdrdo.h" #include "wrapper.h" #include "frame.h" #include "alf.h" #include "sao.h" /** * =========================================================================== * global variables * =========================================================================== */ slice_row_index_t g_slice_lcu_row_order[1024]; #if XAVS2_TRACE extern int g_sym_count; /* global symbol count for trace */ extern int g_bit_count; /* global bit count for trace */ #endif /* --------------------------------------------------------------------------- * ʼLCUеı˳ */ void slice_lcu_row_order_init(xavs2_t *h) { slice_row_index_t *lcurow = g_slice_lcu_row_order; int num_lcu_row = h->i_height_in_lcu; int idx_slice = 0; int i; if (h->param->i_lcurow_threads > 1 && h->param->slice_num > 1) { int slice_num = h->param->slice_num; int set_new_lcu_row = 1; int k; /* set task table. the order of encoding task priority: * 1) first LCU row in each slice; * 2) other LCU rows. 
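         *
         *    For example, with 2 slices of 3 LCU rows each (rows 0..2 and 3..5),
         *    the resulting task order is 0, 3, 1, 4, 2, 5: the first row of every
         *    slice is scheduled first, then the remaining rows of all slices are
         *    interleaved round-robin.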
*/ for (i = 0, idx_slice = 0; idx_slice < slice_num; idx_slice++) { lcurow[i].lcu_y = (int16_t)(h->slices[idx_slice]->i_first_lcu_y); lcurow[i].row_type = 0; lcurow[i].slice_idx = (int8_t)idx_slice; i++; } for (k = 0; set_new_lcu_row; k++) { set_new_lcu_row = 0; for (idx_slice = 0; idx_slice < slice_num && i < num_lcu_row; idx_slice++) { slice_t *p_slice = h->slices[idx_slice]; int bottom_row_y = p_slice->i_first_lcu_y + p_slice->i_lcu_row_num - 1; int new_row = lcurow[k * slice_num + idx_slice].lcu_y + 1; /* next LCU row in same slice */ if (new_row > p_slice->i_first_lcu_y && new_row <= bottom_row_y) { lcurow[i].lcu_y = (int16_t)(new_row); lcurow[i].row_type = 1 + (new_row == bottom_row_y); lcurow[i].slice_idx = (int8_t)(idx_slice); set_new_lcu_row = 1; /* set a new LCU row */ i++; } } } } else { slice_t *p_slice = h->slices[idx_slice]; for (i = 0; i < num_lcu_row; i++) { int row_type = (i != p_slice->i_first_lcu_y) + (i == p_slice->i_first_lcu_y + p_slice->i_lcu_row_num - 1); lcurow[i].lcu_y = (int16_t)(i); lcurow[i].row_type = (int8_t)row_type; lcurow[i].slice_idx = (int8_t)idx_slice; if (row_type == 2) { idx_slice++; /* a new slice appear */ p_slice = h->slices[idx_slice]; } } } // Ĭм˳ } /* --------------------------------------------------------------------------- * initializes the parameters for all slices */ void xavs2_slices_init(xavs2_t *h) { slice_t *p_slice; if (h->param->slice_num < 2) { /* single slice per frame */ p_slice = h->slices[0]; /* set slice properties */ p_slice->i_first_lcu_xy = 0; p_slice->i_last_lcu_xy = h->i_height_in_lcu * h->i_width_in_lcu - 1; p_slice->i_first_scu_y = 0; p_slice->i_first_lcu_y = 0; p_slice->i_lcu_row_num = h->i_height_in_lcu; p_slice->i_last_lcu_y = p_slice->i_first_lcu_y + p_slice->i_lcu_row_num - 1; p_slice->p_slice_bs_buf = h->p_bs_buf_slice; p_slice->len_slice_bs_buf = h->i_bs_buf_slice; } else { /* multi-slice per frame */ uint8_t *p_bs_start = h->p_bs_buf_slice; const int i_slice_num = h->param->slice_num; int i_rest_rows = h->i_height_in_lcu; int i_len_per_row = (h->i_bs_buf_slice - i_slice_num * CACHE_LINE_256B) / i_rest_rows; int i_first_row_id = 0; int i_left_slice_num = i_slice_num; int i_scus_in_lcu = 1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int i_avg_rows; int i_bs_len; int i; /* set properties for each slice */ for (i = 0; i < i_slice_num; i++) { p_slice = h->slices[i]; /* compute lcu-row number in a slice */ i_avg_rows = (i_rest_rows + i_left_slice_num - 1) / i_left_slice_num; i_rest_rows -= i_avg_rows;/* left lcu rows */ i_left_slice_num--; /* left slice number */ /* set slice properties */ p_slice->i_first_lcu_xy = i_first_row_id * h->i_width_in_lcu; p_slice->i_first_scu_y = i_first_row_id * i_scus_in_lcu; p_slice->i_first_lcu_y = i_first_row_id; p_slice->i_lcu_row_num = i_avg_rows; p_slice->i_last_lcu_xy = p_slice->i_first_lcu_xy + (p_slice->i_lcu_row_num * h->i_width_in_lcu - 1); p_slice->i_last_lcu_y = p_slice->i_first_lcu_y + p_slice->i_lcu_row_num - 1; /* init slice bs, start at align 128-byte */ ALIGN_256_PTR(p_bs_start);/* align 256B */ i_bs_len = i_len_per_row * p_slice->i_lcu_row_num; i_bs_len = (i_bs_len >> 8) << 8; /* the length is a multiple of 256 */ p_slice->p_slice_bs_buf = p_bs_start; p_slice->len_slice_bs_buf = i_bs_len; p_bs_start += i_bs_len; /* update row id for next slice */ i_first_row_id += i_avg_rows; assert(i_first_row_id <= h->i_height_in_lcu); } } } /* --------------------------------------------------------------------------- * estimate CU depth range */ //#if OPT_CU_DEPTH_CTRL static void 
est_cu_depth_range(xavs2_t *h, int *min_level, int *max_level) { static const int L_WEIGHT[] = {3,2,0,1,5}; // [Left Top TopLeft TopRight Col] static const int TH_WEIGHT[3] = {25, 15, 5}; int b_left_cu = h->lcu.i_pix_x > 0 && (cu_get_slice_index(h, h->lcu.i_scu_x, h->lcu.i_scu_y) == cu_get_slice_index(h, h->lcu.i_scu_x - 1, h->lcu.i_scu_y)); int b_top_cu = h->lcu.i_pix_y > 0 && (cu_get_slice_index(h, h->lcu.i_scu_x, h->lcu.i_scu_y) == cu_get_slice_index(h, h->lcu.i_scu_x, h->lcu.i_scu_y - 1)); #if SAVE_CU_INFO int b_col_cu = (h->i_type != SLICE_TYPE_I) && (h->fref[0]->cu_mode[h->lcu.i_scu_xy] < PRED_I_2Nx2N); #else int b_col_cu = FALSE; #endif int min_level_ctrl = h->i_scu_level; int max_level_ctrl = h->i_lcu_level; int min_level_pred = h->i_lcu_level - 3; int max_level_pred = h->i_lcu_level - 0; int min_left_level = h->i_lcu_level; int min_top_level = h->i_lcu_level; int i = 0; int cu_with_of_lcu = 1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); if (b_left_cu && b_top_cu) { // check left CTU's max depth int i_left_cu_y = h->lcu.i_scu_y; int i_top_cu_x = h->lcu.i_scu_x; cu_info_t *p_left = &h->cu_info[h->lcu.i_scu_xy - 1]; cu_info_t *p_top = &h->cu_info[h->lcu.i_scu_xy - h->i_width_in_mincu]; for (i = cu_with_of_lcu; i != 0; i--) { if (i_left_cu_y++ < h->i_height_in_mincu) { min_left_level = XAVS2_MIN(min_left_level, p_left->i_level); p_left += h->i_width_in_mincu; } if (i_top_cu_x++ < h->i_width_in_mincu) { min_top_level = XAVS2_MIN(min_top_level, p_top->i_level); p_top++; } } min_left_level = (min_left_level >= h->i_lcu_level - 1); min_top_level = (min_top_level >= h->i_lcu_level - 1); if (min_left_level && min_top_level) { min_level_pred = h->i_lcu_level - 2; max_level_pred = h->i_lcu_level - 0; // depth range limited to [0, 1, 2] } else if (!min_left_level && !min_top_level) { min_level_pred = h->i_lcu_level - 3; max_level_pred = h->i_lcu_level - 1; // depth range limited to [1, 2, 3] } } min_level_pred = XAVS2_MAX(min_level_pred, h->i_scu_level); if (b_left_cu && b_top_cu && b_col_cu) { #if SAVE_CU_INFO int level_T = h->i_lcu_level - h->cu_info[h->lcu.i_scu_xy - h->i_width_in_mincu].i_level; // top int level_L = h->i_lcu_level - h->cu_info[h->lcu.i_scu_xy - 1].i_level; // left int level_TL = h->i_lcu_level - h->cu_info[h->lcu.i_scu_xy - 1 - h->i_width_in_mincu].i_level; // top-left int level_TR = h->i_lcu_level - h->cu_info[h->lcu.i_scu_xy + 1 - h->i_width_in_mincu].i_level; // top-right int level_C = h->i_lcu_level - h->fref[0]->cu_level[h->lcu.i_scu_xy]; // col-located int weight = L_WEIGHT[0] * level_L + L_WEIGHT[1] * level_T+L_WEIGHT[2] * level_TL + L_WEIGHT[3] * level_TR+L_WEIGHT[4] * level_C; if (weight >= TH_WEIGHT[0]) { min_level_ctrl = -3; max_level_ctrl = -2; } else if (weight >= TH_WEIGHT[1]) { min_level_ctrl = -3; max_level_ctrl = -1; } else if (weight >= TH_WEIGHT[2]) { min_level_ctrl = -2; max_level_ctrl = 0; } else { min_level_ctrl = -1; max_level_ctrl = 0; } min_level_ctrl = XAVS2_MAX(h->i_scu_level, min_level_ctrl + h->i_lcu_level); max_level_ctrl = max_level_ctrl + h->i_lcu_level; #endif } else { min_level_ctrl = h->i_scu_level; max_level_ctrl = h->i_lcu_level; } *min_level = XAVS2_MAX(min_level_ctrl, min_level_pred); *max_level = XAVS2_MIN(max_level_ctrl, max_level_pred); assert(*min_level <= *max_level); } //#endif /* --------------------------------------------------------------------------- * store cu info for one LCU row */ static void store_cu_info_row(row_info_t *row) { int i, j, k, l; xavs2_t *h = row->h; int lcu_height_in_scu = 1 << (h->i_lcu_level - 
MIN_CU_SIZE_IN_BIT); int last_lcu_row = ((h->lcu.i_scu_y + lcu_height_in_scu) < h->i_height_in_mincu ? 0 : 1); int num_scu_y = last_lcu_row == 0 ? lcu_height_in_scu : h->i_height_in_mincu - h->lcu.i_scu_y; #if SAVE_CU_INFO /* store cu info (one lcu row) of reference frame */ for (i = 0; i < num_scu_y; i++) { int scu_offset = (h->lcu.i_scu_y + i) * h->i_width_in_mincu; cu_info_t *p_cu_info = &h->cu_info[scu_offset]; for (j = 0; j < h->i_width_in_mincu; j++) { h->fdec->cu_level[scu_offset + j] = (int8_t)p_cu_info->i_level; h->fdec->cu_mode[scu_offset + j] = (int8_t)p_cu_info->i_mode; h->fdec->cu_cbp[scu_offset + j] = (int8_t)p_cu_info->i_cbp; p_cu_info++; } } #endif if (h->i_type != SLICE_TYPE_I) { // store motion information for temporal prediction const int w0_in_16x16 = h->i_width_in_minpu >> 2; const int h0_in_16x16 = h->i_height_in_minpu >> 2; const int w_in_16x16 = (h->i_width_in_minpu + 3) >> 2; const int h_in_16x16 = (h->i_height_in_minpu + 3) >> 2; const int w_in_4x4 = h->i_width_in_minpu; const int h_in_4x4 = h->i_height_in_minpu; int start_16x16_y = h->lcu.i_scu_y >> 1; int num_16x16_y = num_scu_y >> 1; const mv_t *src_mv = h->fwd_1st_mv; const int8_t *src_ref = h->fwd_1st_ref; mv_t *dst_mv = h->fdec->pu_mv; int8_t *dst_ref = h->fdec->pu_ref; assert(num_16x16_y >= 1 || last_lcu_row); // store middle pixel's motion information for (i = start_16x16_y; i < start_16x16_y + num_16x16_y; i++) { k = ((i << 2) + 2) * w_in_4x4; for (j = 0; j < w0_in_16x16; j++) { l = (j << 2) + 2; dst_mv[i * w_in_16x16 + j] = src_mv[k + l]; dst_ref[i * w_in_16x16 + j] = src_ref[k + l]; } } ///! last LCU row if (last_lcu_row && (h0_in_16x16 < h_in_16x16)) { k = (((h0_in_16x16 << 2) + h_in_4x4) >> 1) * w_in_4x4; for (j = 0; j < w0_in_16x16; j++) { l = (j << 2) + 2; dst_mv[h0_in_16x16 * w_in_16x16 + j] = src_mv[k + l]; dst_ref[h0_in_16x16 * w_in_16x16 + j] = src_ref[k + l]; } if (w0_in_16x16 < w_in_16x16) { l = ((w0_in_16x16 << 2) + w_in_4x4) >> 1; dst_mv[h0_in_16x16 * w_in_16x16 + w0_in_16x16] = src_mv[k + l]; dst_ref[h0_in_16x16 * w_in_16x16 + w0_in_16x16] = src_ref[k + l]; } } ///! last column if (w0_in_16x16 < w_in_16x16) { i = ((w0_in_16x16 << 2) + w_in_4x4) >> 1; for (j = start_16x16_y; j < start_16x16_y + num_16x16_y; j++) { dst_mv[j * w_in_16x16 + w0_in_16x16] = src_mv[(j * 4 + 2) * w_in_4x4 + i]; dst_ref[j * w_in_16x16 + w0_in_16x16] = src_ref[(j * 4 + 2) * w_in_4x4 + i]; } } } } /* --------------------------------------------------------------------------- * encode one lcu row */ void *xavs2_lcu_row_write(void *arg) { row_info_t *row = (row_info_t *)arg; xavs2_t *h = row->h; slice_t *slice = h->slices[h->i_slice_index]; aec_t *p_aec = &h->aec; const int i_lcu_y = row->row; row_info_t *last_row = (i_lcu_y > slice->i_first_lcu_y) ? 
&h->frameinfo->rows[i_lcu_y - 1] : 0; lcu_analyse_t lcu_analyse = g_funcs.compress_ctu[h->i_type]; const bool_t b_enable_wpp = h->param->i_lcurow_threads > 1; int min_level = h->i_scu_level; int max_level = h->i_lcu_level; int i_lcu_x; #if ENABLE_RATE_CONTROL_CU int temp_dquant; #endif h->lcu.get_skip_mvs = g_funcs.get_skip_mv_predictors[h->i_type]; if (h->param->slice_num > 1) { slice_init_bufer(h, slice); } /* loop over all LCUs in current lcu row ------------------------ */ for (i_lcu_x = 0; i_lcu_x < h->i_width_in_lcu; i_lcu_x++) { /* 0, initialization before sync */ lcu_info_t *lcu = &row->lcus[i_lcu_x]; lcu_start_init_pos(h, i_lcu_x, i_lcu_y); lcu->slice_index = h->i_slice_index; lcu->scu_xy = h->lcu.i_scu_xy; lcu->pix_x = h->lcu.i_pix_x; lcu->pix_y = h->lcu.i_pix_y; h->lcu.lcu_coeff[0] = lcu->coeffs_y; h->lcu.lcu_coeff[1] = lcu->coeffs_uv[0]; h->lcu.lcu_coeff[2] = lcu->coeffs_uv[1]; #if ENABLE_RATE_CONTROL_CU h->last_dquant = &lcu->last_dqp; #endif /* 1, sync */ wait_lcu_row_coded(last_row, XAVS2_MIN(h->i_width_in_lcu - 1, i_lcu_x + 1)); if (b_enable_wpp && last_row != NULL && i_lcu_x == 0) { aec_copy_aec_state(p_aec, &last_row->aec_set); } /* 3, start */ lcu_start_init_pixels(h, i_lcu_x, i_lcu_y); if (h->td_rdo != NULL) { tdrdo_lcu_adjust_lambda(h, &h->f_lambda_mode); } #if ENABLE_RATE_CONTROL_CU temp_dquant = *h->last_dquant; #endif /* 4, analyze */ if (IS_ALG_ENABLE(OPT_CU_DEPTH_CTRL)) { est_cu_depth_range(h, &min_level, &max_level); } lcu_analyse(h, p_aec, h->lcu.p_ctu, h->i_lcu_level, min_level, max_level, MAX_COST); if (h->td_rdo != NULL) { tdrdo_lcu_update(h); } #if ENABLE_RATE_CONTROL_CU *h->last_dquant = temp_dquant; #endif /* 5, lcu end */ lcu_end(h, i_lcu_x, i_lcu_y); if (b_enable_wpp && i_lcu_x == 1) { /* backup aec contexts for the next row */ aec_copy_aec_state(&row->aec_set, p_aec); } /* 4, deblock on lcu */ #if XAVS2_DUMP_REC if (!h->param->loop_filter_disable) { xavs2_lcu_deblock(h, h->fdec); } #else /* no need to do loop-filter without dumping, but at this time, * the PSNR is computed not correctly if XAVS2_STAT is on. 
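         * Skipping the filter here is safe because the reconstruction of a
         * non-reference frame is never used for prediction; only the reported
         * distortion statistics of that frame become approximate.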
*/ if (!h->param->loop_filter_disable && h->fdec->rps.referd_by_others) { xavs2_lcu_deblock(h, h->fdec); } #endif /* copy reconstruction pixels when the last LCU is reconstructed */ if (h->param->enable_sao) { if (i_lcu_x > 0) { sao_get_lcu_param_after_deblock(h, p_aec, i_lcu_x - 1, i_lcu_y); sao_filter_lcu(h, h->sao_blk_params[i_lcu_y * h->i_width_in_lcu + i_lcu_x - 1], i_lcu_x - 1, i_lcu_y); } if (i_lcu_x == h->i_width_in_lcu - 1) { sao_get_lcu_param_after_deblock(h, p_aec, i_lcu_x, i_lcu_y); sao_filter_lcu(h, h->sao_blk_params[i_lcu_y * h->i_width_in_lcu + i_lcu_x], i_lcu_x, i_lcu_y); } } xavs2_thread_mutex_lock(&row->mutex); /* lock */ row->coded = i_lcu_x; // h->fdec->num_lcu_coded_in_row[row->row]++; xavs2_thread_mutex_unlock(&row->mutex); /* unlock */ /* signal to the next row */ if (i_lcu_x >= 1) { xavs2_thread_cond_signal(&row->cond); } } /* post-processing for current lcu row ------------------------- */ if (h->param->enable_sao && (h->slice_sao_on[0] || h->slice_sao_on[1] || h->slice_sao_on[2])) { int sao_off_num_y = 0; int sao_off_num_u = 0; int sao_off_num_v = 0; int idx_lcu = i_lcu_y * h->i_width_in_lcu; for (i_lcu_x = 0; i_lcu_x < h->i_width_in_lcu; i_lcu_x++, idx_lcu++) { if (h->sao_blk_params[idx_lcu][0].typeIdc == SAO_TYPE_OFF) { sao_off_num_y++; } if (h->sao_blk_params[idx_lcu][1].typeIdc == SAO_TYPE_OFF) { sao_off_num_u++; } if (h->sao_blk_params[idx_lcu][2].typeIdc == SAO_TYPE_OFF) { sao_off_num_v++; } } h->num_sao_lcu_off[i_lcu_y][0] = sao_off_num_y; h->num_sao_lcu_off[i_lcu_y][1] = sao_off_num_u; h->num_sao_lcu_off[i_lcu_y][2] = sao_off_num_v; } else { int num_lcu = h->i_width_in_lcu; h->num_sao_lcu_off[i_lcu_y][0] = num_lcu; h->num_sao_lcu_off[i_lcu_y][1] = num_lcu; h->num_sao_lcu_off[i_lcu_y][2] = num_lcu; } if (h->param->enable_alf && (h->pic_alf_on[0] || h->pic_alf_on[1] || h->pic_alf_on[2])) { if (h->i_type == SLICE_TYPE_B && IS_ALG_ENABLE(OPT_FAST_ALF)) { i_lcu_x = ((i_lcu_y + h->fenc->i_frm_coi) & 1); for (; i_lcu_x < h->i_width_in_lcu; i_lcu_x += 2) { alf_get_statistics_lcu(h, i_lcu_x, i_lcu_y, h->fenc, h->fdec); } } else { for (i_lcu_x = 0; i_lcu_x < h->i_width_in_lcu; i_lcu_x++) { alf_get_statistics_lcu(h, i_lcu_x, i_lcu_y, h->fenc, h->fdec); } } } /* reference frame */ if (h->fdec->rps.referd_by_others) { /* store cu info */ store_cu_info_row(row); /* expand border */ xavs2_frame_expand_border_lcurow(h, h->fdec, i_lcu_y); /* interpolate (after finished expanding border) */ #if ENABLE_FRAME_SUBPEL_INTPL if (h->use_fractional_me != 0) { interpolate_lcu_row(h, h->fdec, i_lcu_y); } #endif if (last_row) { /* make sure the top row have finished interpolation and padding */ xavs2_frame_t *fdec = h->fdec; xavs2_thread_mutex_lock(&fdec->mutex); /* lock */ while (fdec->num_lcu_coded_in_row[last_row->row] < h->i_width_in_lcu) { xavs2_thread_cond_wait(&fdec->cond, &fdec->mutex); } xavs2_thread_mutex_unlock(&fdec->mutex); /* unlock */ } } /* release task */ xavs2e_release_row_task(row); return 0; } /* --------------------------------------------------------------------------- * start encodes one slice */ void xavs2_slice_write_start(xavs2_t *h) { aec_t *p_aec = &h->aec; slice_t *slice = h->slices[h->i_slice_index]; /* init slice */ #if ENABLE_RATE_CONTROL_CU h->frameinfo->rows[slice->i_first_lcu_y].lcus[0].last_dqp = 0; #endif slice->i_qp = h->i_qp; /* init bs_t, reserve space to store the length of bitstream */ xavs2_bs_init(&slice->bs, slice->p_slice_bs_buf, slice->len_slice_bs_buf); sao_slice_onoff_decision(h, h->slice_sao_on); /* write slice header */ 
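    /* (the slice-level SAO on/off decision above has to precede the header
     *  write, as the header signalling depends on h->slice_sao_on; the
     *  bitstream is then byte-aligned so that the AEC engine can start on a
     *  byte boundary) */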
xavs2_slice_header_write(h, slice); bs_byte_align(&slice->bs); /* init AEC */ aec_start(h, p_aec, slice->bs.p_start + PSEUDO_CODE_SIZE, slice->bs.p_end, 0); /* init slice buffers */ slice_init_bufer(h, slice); /* prediction mode is set to -1 outside the frame, * indicating that no prediction can be made from this part */ { int ip_stride = h->i_width_in_minpu + 16; int lcu_height_in_pu = ((1 << h->i_lcu_level) >> MIN_PU_SIZE_IN_BIT); g_funcs.fast_memset((h->ipredmode - ip_stride - 16), -1, (lcu_height_in_pu + 1) * ip_stride * sizeof(int8_t)); } } xavs2-1.3/source/encoder/slice.h000066400000000000000000000243101340660520300166060ustar00rootroot00000000000000/* * slice.h * * Description of this file: * Slice Processing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_SLICE_H #define XAVS2_SLICE_H /** * =========================================================================== * structures * =========================================================================== */ typedef struct slice_row_index_t { int16_t lcu_y; /* б */ int8_t slice_idx; /* ڵSlice */ int8_t row_type; /* 0: SliceʼλõУ1:ͨ2: Sliceλõ */ } slice_row_index_t; extern slice_row_index_t g_slice_lcu_row_order[1024]; /* --------------------------------------------------------------------------- * ʼSlicebufferָ */ static ALWAYS_INLINE void slice_init_bufer(xavs2_t *h, slice_t *slice) { /* init slice buffers */ h->ipredmode = slice->slice_ipredmode; h->intra_border[0] = slice->slice_intra_border[0]; h->intra_border[1] = slice->slice_intra_border[1]; h->intra_border[2] = slice->slice_intra_border[2]; h->p_deblock_flag[0] = slice->slice_deblock_flag[0]; h->p_deblock_flag[1] = slice->slice_deblock_flag[1]; } /* --------------------------------------------------------------------------- * ȴһLCUָLCU */ static ALWAYS_INLINE void wait_lcu_row_coded(row_info_t *last_row, int wait_lcu_coded) { if (last_row != NULL && last_row->coded < wait_lcu_coded) { xavs2_thread_mutex_lock(&last_row->mutex); /* lock */ while (last_row->coded < wait_lcu_coded) { xavs2_thread_cond_wait(&last_row->cond, &last_row->mutex); } xavs2_thread_mutex_unlock(&last_row->mutex); /* unlock */ } } /* --------------------------------------------------------------------------- * ѯһLCUǷѱ */ static ALWAYS_INLINE int is_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row) { return (frm->num_lcu_coded_in_row[lcu_row] > h->i_width_in_lcu); } /* --------------------------------------------------------------------------- * ѯһLCUǷѱ */ static ALWAYS_INLINE void set_lcu_row_finished(xavs2_t *h, xavs2_frame_t *frm, int lcu_row) { frm->num_lcu_coded_in_row[lcu_row] = h->i_width_in_lcu + 1; } /* --------------------------------------------------------------------------- * release a row task */ static INLINE void xavs2e_release_row_task(row_info_t *row) { if (row) { xavs2_t *h = row->h; xavs2_frame_t *fdec = h->fdec; xavs2_handler_t *h_mgr = h->h_top; int b_slice_boundary_done = FALSE; /* ʱSlice߽Ѵֱ꣬ӽвֵҪ * Ҫд */ if (h->param->b_cross_slice_loop_filter == FALSE) { if (row->b_top_slice_border && row->row > 0) { if (is_lcu_row_finished(h, fdec, row->row - 1)) { int y_start = (row->row << h->i_lcu_level) - 4; interpolate_sample_rows(h, h->fdec, y_start, 8, 0, 0); b_slice_boundary_done = TRUE; } } else if (row->b_down_slice_border && row->row < h->i_height_in_lcu - 1) { if (is_lcu_row_finished(h, fdec, row->row + 1)) { int y_start = ((row->row + 1) << h->i_lcu_level) - 4; interpolate_sample_rows(h, h->fdec, y_start, 8, 0, 0); b_slice_boundary_done = TRUE; } } } else { /* TODO: SliceʱSlice߽Ĵ */ if (h->param->slice_num > 1) { xavs2_log(NULL, XAVS2_LOG_ERROR, "CrossSliceLoopFilter not supported now!\n"); assert(0); } } xavs2_thread_mutex_lock(&fdec->mutex); /* lock */ if (h->param->b_cross_slice_loop_filter == FALSE) { if (b_slice_boundary_done == FALSE && row->b_top_slice_border && row->row > 0) { if (is_lcu_row_finished(h, fdec, row->row - 1)) { int y_start = (row->row << h->i_lcu_level) - 4; interpolate_sample_rows(h, h->fdec, y_start, 8, 0, 0); // xavs2_log(NULL, XAVS2_LOG_DEBUG, "Intp2 POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n", // h->fenc->i_frame, h->i_slice_index, row->row, y_start, y_start + 8); } } else if (b_slice_boundary_done == FALSE && row->b_down_slice_border && row->row < h->i_height_in_lcu - 1) { if 
(is_lcu_row_finished(h, fdec, row->row + 1)) { int y_start = ((row->row + 1) << h->i_lcu_level) - 4; interpolate_sample_rows(h, h->fdec, y_start, 8, 0, 0); // xavs2_log(NULL, XAVS2_LOG_DEBUG, "Intp3 POC [%3d], Slice %2d, Row %2d, [%3d, %3d)\n", // h->fenc->i_frame, h->i_slice_index, row->row, y_start, y_start + 8); } } } else { /* TODO: SliceʱSlice߽Ĵ */ } set_lcu_row_finished(h, fdec, row->row); xavs2_thread_mutex_unlock(&fdec->mutex); /* unlock */ /* broadcast to the aec thread and all waiting contexts */ xavs2_thread_cond_broadcast(&fdec->cond); if (h->task_type == XAVS2_TASK_ROW) { xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ h->task_status = XAVS2_TASK_FREE; xavs2_thread_mutex_unlock(&h_mgr->mutex); /* unlock */ /* signal a free row context available */ xavs2_thread_cond_signal(&h_mgr->cond[SIG_ROW_CONTEXT_RELEASED]); } } } /* --------------------------------------------------------------------------- * sync of frame parallel coding */ static ALWAYS_INLINE void xavs2e_inter_sync(xavs2_t *h, int lcu_y, int lcu_x) { if (h->i_type != SLICE_TYPE_I && h->h_top->i_frm_threads > 1) { int num_lcu_delay = ((h->param->search_range + (1 << h->i_lcu_level) - 1) >> h->i_lcu_level) + 1; int low_bound = XAVS2_MAX(lcu_y - num_lcu_delay, 0); int up_bound = XAVS2_MIN(lcu_y + num_lcu_delay, h->i_height_in_lcu - 1); int col_coded = h->i_width_in_lcu; int i, j; UNUSED_PARAMETER(lcu_x); for (i = 0; i < h->i_ref; i++) { xavs2_frame_t *p_ref = h->fref[i]; for (j = low_bound; j <= up_bound; j++) { xavs2_thread_mutex_lock(&p_ref->mutex); /* lock */ while (p_ref->num_lcu_coded_in_row[j] < col_coded) { xavs2_thread_cond_wait(&p_ref->cond, &p_ref->mutex); } xavs2_thread_mutex_unlock(&p_ref->mutex); /* unlock */ } } } } /* --------------------------------------------------------------------------- * get a row encoder handle */ static INLINE xavs2_t *xavs2e_alloc_row_task(xavs2_t *h) { xavs2_handler_t *h_mgr = h->h_top; int i; assert(h->task_type == XAVS2_TASK_FRAME && h->frameinfo); xavs2_thread_mutex_lock(&h_mgr->mutex); /* lock */ /* wait until we successfully get one free row context */ for (; h_mgr->i_exit_flag != XAVS2_EXIT_THREAD;) { for (i = 0; i < h_mgr->num_row_contexts; i++) { xavs2_t *h_row_coder = &h_mgr->row_contexts[i]; if (h_row_coder->task_status == XAVS2_TASK_FREE) { h_row_coder->task_status = XAVS2_TASK_BUSY; h_row_coder->frameinfo = h->frameinfo; /* duplicate frame info */ /* sync row contexts */ memcpy(&h_row_coder->row_vars_1, &h->row_vars_1, (uint8_t *)&h->row_vars_2 - (uint8_t *)&h->row_vars_1); /* make the state of the aec engine same as the one when the slice starts */ /* h->aecλòͬܲһLCUбʱ֤ͬһ */ aec_copy_aec_state(&h_row_coder->aec, &h->aec); /* unlock */ xavs2_thread_mutex_unlock(&h_mgr->mutex); return h_row_coder; } } xavs2_thread_cond_wait(&h_mgr->cond[SIG_ROW_CONTEXT_RELEASED], &h_mgr->mutex); } /* unlock */ xavs2_thread_mutex_unlock(&h_mgr->mutex); return NULL; } #define xavs2_slices_init FPFX(slices_init) void xavs2_slices_init(xavs2_t *h); #define xavs2_slice_write_start FPFX(slice_write_start) void xavs2_slice_write_start(xavs2_t *h); #define xavs2_lcu_row_write FPFX(lcu_row_write) void *xavs2_lcu_row_write(void *arg); #define slice_lcu_row_order_init FPFX(slice_lcu_row_order_init) void slice_lcu_row_order_init(xavs2_t *h); #define xavs2e_encode_one_frame FPFX(xavs2e_encode_one_frame) void *xavs2e_encode_one_frame(void *arg); #endif // XAVS2_SLICE_H xavs2-1.3/source/encoder/tdrdo.c000066400000000000000000000666551340660520300166400ustar00rootroot00000000000000/* * tdrdo.c * * 
Description of this file: * TDRDO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "tdrdo.h" #include "wrapper.h" #include "frame.h" #define WORKBLOCKSIZE 64 #define SEARCHRANGE 64 /** * =========================================================================== * type defines * =========================================================================== */ typedef struct Frame { uint32_t FrameWidth; uint32_t FrameHeight; uint32_t nStrideY; pel_t *Y_base; } Frame; typedef struct BlockDistortion { uint32_t GlobalBlockNumber; uint16_t BlockNumInHeight; uint16_t BlockNumInWidth; uint16_t BlockWidth; uint16_t BlockHeight; uint16_t OriginX; uint16_t OriginY; uint16_t SearchRange; short MVx; short MVy; double MSE; double MVL; short BlockQP; double BlockLambda; short BlockType; } BlockDistortion, BD; typedef struct FrameDistortion { uint32_t FrameNumber; uint32_t BlockSize; uint32_t CUSize; uint32_t TotalNumOfBlocks; uint32_t TotalBlockNumInHeight; uint32_t TotalBlockNumInWidth; BD *BlockDistortionArray; struct FrameDistortion *subFrameDistortionArray; } FrameDistortion, FD; typedef struct DistortionList { uint32_t TotalFrameNumber; uint32_t FrameWidth; uint32_t FrameHeight; uint32_t BlockSize; FD *FrameDistortionArray; } DistortionList, DL; struct td_rdo_t { Frame porgF; Frame ppreF; Frame precF; DL OMCPDList; DL RealDList; FD *pOMCPFD, *pRealFD; int StepLength; double *KappaTable; double GlobeLambdaRatio; int GlobeFrameNumber; int CurMBQP; int QpOffset[32]; int globenumber; double *D; double *DMCP; double *BetaTable; double *MultiplyBetas; }; typedef struct Block { uint32_t BlockWidth; uint32_t BlockHeight; uint32_t OriginX; uint32_t OriginY; } Block; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static DL *CreatDistortionList(DL *NewDL, uint32_t totalframenumber, uint32_t width, uint32_t height, uint32_t blocksize, uint32_t cusize) { int tBlockNumInHeight, tBlockNumInWidth, tBlockNumber; uint32_t i; NewDL->TotalFrameNumber = totalframenumber; NewDL->FrameWidth = width; NewDL->FrameHeight = height; blocksize = blocksize < 4 ? 4 : (blocksize > 64 ? 
64 : blocksize); NewDL->BlockSize = blocksize; tBlockNumInHeight = (int)ceil(1.0 * height / blocksize); tBlockNumInWidth = (int)ceil(1.0 * width / blocksize); tBlockNumber = tBlockNumInHeight * tBlockNumInWidth; for (i = 0; i < totalframenumber; i++) { NewDL->FrameDistortionArray[i].FrameNumber = i; NewDL->FrameDistortionArray[i].BlockSize = blocksize; NewDL->FrameDistortionArray[i].CUSize = cusize; NewDL->FrameDistortionArray[i].TotalNumOfBlocks = tBlockNumber; NewDL->FrameDistortionArray[i].TotalBlockNumInHeight = tBlockNumInHeight; NewDL->FrameDistortionArray[i].TotalBlockNumInWidth = tBlockNumInWidth; NewDL->FrameDistortionArray[i].BlockDistortionArray = NULL; NewDL->FrameDistortionArray[i].subFrameDistortionArray = NULL; } return NewDL; } /* --------------------------------------------------------------------------- */ static double CalculateBlockMSE(Frame *FA, Frame *FB, Block *A, Block *B) { uint16_t x, y; int e, blockpixel = A->BlockHeight * A->BlockWidth; pel_t *YA, *YB; double dSSE = 0; YA = FA->Y_base + A->OriginY * FA->nStrideY + A->OriginX; YB = FB->Y_base + B->OriginY * FB->nStrideY + B->OriginX; for (y = 0; y < A->BlockHeight; y++) { for (x = 0; x < A->BlockWidth; x++) { e = YA[x] - YB[x]; dSSE += e * e; } YA = YA + FA->nStrideY; YB = YB + FB->nStrideY; } return dSSE / blockpixel; } /* --------------------------------------------------------------------------- */ static void MotionDistortion(FD *currentFD, Frame *FA, Frame *FB, uint32_t searchrange) { static int dlx[9] = {0, -2, -1, 0, 1, 2, 1, 0, -1}; static int dly[9] = {0, 0, -1, -2, -1, 0, 1, 2, 1}; static int dsx[5] = {0, -1, 0, 1, 0}; static int dsy[5] = {0, 0, -1, 0, 1}; double currentMSE, candidateMSE; BD *currentBD; Block BA, BB; Block *pBA, *pBB; uint32_t blocksize, TotalBlockNumInHeight, TotalBlockNumInWidth, nBH, nBW; int top, bottom, left, right; int *searchpatternx = NULL; int *searchpatterny = NULL; int patternsize = 0; int cx, cy; int l; int flag9p, flag5p; int nextcx = 0; int nextcy = 0; int x, y; pBA = &BA; pBB = &BB; blocksize = currentFD->BlockSize; TotalBlockNumInHeight = currentFD->TotalBlockNumInHeight; TotalBlockNumInWidth = currentFD->TotalBlockNumInWidth; for (nBH = 0; nBH < TotalBlockNumInHeight; nBH++) { for (nBW = 0; nBW < TotalBlockNumInWidth; nBW++) { memset(pBA, 0, sizeof(BA)); memset(pBB, 0, sizeof(BB)); pBA->OriginX = blocksize * nBW; pBA->OriginY = blocksize * nBH; pBA->BlockHeight = blocksize * (nBH + 1) < FA->FrameHeight ? blocksize : FA->FrameHeight - blocksize * nBH; pBA->BlockWidth = blocksize * (nBW + 1) < FA->FrameWidth ? 
blocksize : FA->FrameWidth - blocksize * nBW; currentBD = ¤tFD->BlockDistortionArray[nBH * TotalBlockNumInWidth + nBW]; currentBD->GlobalBlockNumber = nBH * TotalBlockNumInWidth + nBW; currentBD->BlockNumInHeight = (uint16_t)nBH; currentBD->BlockNumInWidth = (uint16_t)nBW; currentBD->BlockWidth = (uint16_t)pBA->BlockWidth; currentBD->BlockHeight = (uint16_t)pBA->BlockHeight; currentBD->OriginX = (uint16_t)pBA->OriginX; currentBD->OriginY = (uint16_t)pBA->OriginY; currentBD->SearchRange = (uint16_t)searchrange; top = pBA->OriginY - searchrange; bottom = pBA->OriginY + searchrange; left = pBA->OriginX - searchrange; right = pBA->OriginX + searchrange; top = XAVS2_CLIP3(0, (int)(FB->FrameHeight - pBA->BlockHeight), top); bottom = XAVS2_CLIP3(0, (int)(FB->FrameHeight - pBA->BlockHeight), bottom); left = XAVS2_CLIP3(0, (int)(FB->FrameWidth - pBA->BlockWidth ), left); right = XAVS2_CLIP3(0, (int)(FB->FrameWidth - pBA->BlockWidth ), right); pBB->BlockHeight = pBA->BlockHeight; pBB->BlockWidth = pBA->BlockWidth; flag5p = 0; flag9p = 1; cy = pBA->OriginY; cx = pBA->OriginX; while (flag9p || flag5p) { candidateMSE = 1048576; // 1048576 = 1024 * 1024; if (flag9p) { searchpatternx = dlx; searchpatterny = dly; patternsize = 9; } else if (flag5p) { searchpatternx = dsx; searchpatterny = dsy; patternsize = 5; } for (l = 0; l < patternsize; l++) { y = cy + searchpatterny[l]; x = cx + searchpatternx[l]; if (x >= left && x <= right && y >= top && y <= bottom) { pBB->OriginX = x; pBB->OriginY = y; currentMSE = CalculateBlockMSE(FA, FB, pBA, pBB); if (currentMSE < candidateMSE) { candidateMSE = currentMSE; currentBD->MSE = currentMSE; nextcx = x; nextcy = y; } } } if (cy == nextcy && cx == nextcx) { flag9p = 0; flag5p = 1 - flag5p; } else { cy = nextcy; cx = nextcx; } } } } } /* --------------------------------------------------------------------------- */ static void StoreLCUInf(FD *curRealFD, int LeaderBlockNumber, int cuinwidth, int iqp, rdcost_t lambda, int curtype) { BD *workBD; int LeaderNumber = ((LeaderBlockNumber % cuinwidth) / 8 + LeaderBlockNumber / cuinwidth / 8 * curRealFD->TotalBlockNumInWidth) * (curRealFD->CUSize / curRealFD->BlockSize); int workBlockNum; uint32_t x, y, top, left, bottom, right; top = LeaderNumber / curRealFD->TotalBlockNumInWidth; left = LeaderNumber % curRealFD->TotalBlockNumInWidth; bottom = top + curRealFD->CUSize / curRealFD->BlockSize; bottom = bottom <= curRealFD->TotalBlockNumInHeight ? bottom : curRealFD->TotalBlockNumInHeight; right = left + curRealFD->CUSize / curRealFD->BlockSize; right = right <= curRealFD->TotalBlockNumInWidth ? 
right : curRealFD->TotalBlockNumInWidth; workBlockNum = LeaderNumber; for (y = top; y < bottom; y++) { for (x = left; x < right; x++) { workBD = &curRealFD->BlockDistortionArray[workBlockNum + x - left]; workBD->BlockQP = (short)iqp; workBD->BlockLambda = (double)lambda; workBD->BlockType = (short)curtype; } workBlockNum = workBlockNum + curRealFD->TotalBlockNumInWidth; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE double F(double invalue) { double f; if (invalue < 0.5F) { f = 0.015F; } else if (invalue < 2.0F) { f = (54.852103 * invalue * invalue + 10.295705 * invalue - 3.667158) / 1000; } else if (invalue < 8.0F) { f = (-19.235059 * invalue * invalue + 311.129530 * invalue - 317.360050) / 1000 - 0.2280 + 0.2363; } else { f = 0.949F; } return XAVS2_CLIP3F(0.015F, 0.949F, f); } /* --------------------------------------------------------------------------- */ static void CaculateKappaTableLDP(xavs2_t *h, DL *omcplist, DL *realDlist, int framenum, int FrameQP) { td_rdo_t *td_rdo = h->td_rdo; BD *p1stBD, *pcurBD; const double tdrdoAlpha = 0.94F; double fxvalue; double *D, *DMCP; double *BetaTable, *MultiplyBetas; double DsxKappa, Ds; int TotalBlocksInAframe = realDlist->FrameDistortionArray[0].TotalNumOfBlocks; int BetaLength; int PreFrameQP; int t, b; BetaLength = realDlist->TotalFrameNumber - 1 - framenum - 1; BetaLength = XAVS2_MIN(2, BetaLength); memset(td_rdo->KappaTable, 0, TotalBlocksInAframe * sizeof(double)); if (framenum <= 0) { return; } D = td_rdo->D; DMCP = td_rdo->DMCP; BetaTable = td_rdo->BetaTable; MultiplyBetas = td_rdo->MultiplyBetas; memset(D, 0, TotalBlocksInAframe * sizeof(double)); memset(DMCP, 0, TotalBlocksInAframe * sizeof(double)); memset(BetaTable, 0, TotalBlocksInAframe * sizeof(double)); memset(MultiplyBetas, 0, TotalBlocksInAframe * sizeof(double)); p1stBD = realDlist->FrameDistortionArray[framenum - 1].BlockDistortionArray; for (b = 0; b < TotalBlocksInAframe; b++) { D[b] = p1stBD[b].MSE; BetaTable[b] = 1.0F; } for (b = 0; b < TotalBlocksInAframe; b++) { MultiplyBetas[b] = 1.0; } pcurBD = omcplist->FrameDistortionArray[framenum - 1].BlockDistortionArray; for (t = 0; t <= BetaLength; t++) { PreFrameQP = FrameQP - td_rdo->QpOffset[framenum % h->i_gop_size] + td_rdo->QpOffset[(framenum + t) % h->i_gop_size]; for (b = 0; b < TotalBlocksInAframe; b++) { DMCP[b] = tdrdoAlpha * (D[b] + pcurBD[b].MSE); } for (b = 0; b < TotalBlocksInAframe; b++) { fxvalue = (sqrt(2.0) * pow(2.0, (PreFrameQP) / 8.0)) / sqrt(DMCP[b]); D[b] = DMCP[b] * F(fxvalue); BetaTable[b] = tdrdoAlpha * F(fxvalue); if (t > 0) { MultiplyBetas[b] *= BetaTable[b]; td_rdo->KappaTable[b] += MultiplyBetas[b]; } } } DsxKappa = Ds = 0.0F; for (b = 0; b < TotalBlocksInAframe; b++) { t = framenum - 1; Ds += realDlist->FrameDistortionArray[t].BlockDistortionArray[b].MSE; DsxKappa += realDlist->FrameDistortionArray[t].BlockDistortionArray[b].MSE * (1.0F + td_rdo->KappaTable[b]); } td_rdo->GlobeLambdaRatio = DsxKappa / Ds; } /* --------------------------------------------------------------------------- */ static void AdjustLcuQPLambdaLDP(xavs2_t *h, FD *curOMCPFD, int LeaderBlockNumber, int cuinwidth, rdcost_t *plambda) { td_rdo_t *td_rdo = h->td_rdo; double ArithmeticMean, HarmonicMean, GeometricMean; double SumOfMSE; double Kappa, LambdaRatio, dDeltaQP; uint32_t x, y, top, left, bottom, right; int LeaderNumber; int workBlockNum; int counter, iDeltaQP; if (curOMCPFD == NULL) { return; } if (td_rdo->KappaTable == NULL) { dDeltaQP = 0.0F; iDeltaQP = 
dDeltaQP > 0 ? (int)(dDeltaQP + 0.5) : -(int)(-dDeltaQP + 0.5); iDeltaQP = XAVS2_CLIP3F(-2, 2, iDeltaQP); return; } LeaderNumber = ((LeaderBlockNumber % cuinwidth) / 8 + LeaderBlockNumber / cuinwidth / 8 * curOMCPFD->TotalBlockNumInWidth) * (curOMCPFD->CUSize / curOMCPFD->BlockSize); top = LeaderNumber / curOMCPFD->TotalBlockNumInWidth; left = LeaderNumber % curOMCPFD->TotalBlockNumInWidth; bottom = top + curOMCPFD->CUSize / curOMCPFD->BlockSize; bottom = bottom <= curOMCPFD->TotalBlockNumInHeight ? bottom : curOMCPFD->TotalBlockNumInHeight; right = left + curOMCPFD->CUSize / curOMCPFD->BlockSize; right = right <= curOMCPFD->TotalBlockNumInWidth ? right : curOMCPFD->TotalBlockNumInWidth; ArithmeticMean = 0.0; HarmonicMean = 0.0; GeometricMean = 1.0; SumOfMSE = 0.0; counter = 0; workBlockNum = LeaderNumber; for (y = top; y < bottom; y++) { for (x = left; x < right; x++) { SumOfMSE += curOMCPFD->BlockDistortionArray[workBlockNum + x - left].MSE; Kappa = td_rdo->KappaTable[workBlockNum + x - left]; ArithmeticMean += Kappa; HarmonicMean += 1.0 / Kappa; GeometricMean *= Kappa; counter++; } workBlockNum = workBlockNum + curOMCPFD->TotalBlockNumInWidth; } if (counter == 0) { return; } Kappa = ArithmeticMean / counter; SumOfMSE = SumOfMSE / counter; LambdaRatio = td_rdo->GlobeLambdaRatio / (1.0F + Kappa); LambdaRatio = XAVS2_CLIP3F(pow(2.0, -3.0 / 4.0), pow(2.0, 3.0 / 4.0), LambdaRatio); dDeltaQP = (4.0 / log(2.0F)) * log(LambdaRatio); iDeltaQP = dDeltaQP > 0.0 ? (int)(dDeltaQP + 0.5) : -(int)(-dDeltaQP - 0.5); iDeltaQP = XAVS2_CLIP3F(-3, 3, iDeltaQP); *plambda = (rdcost_t)((*plambda) * LambdaRatio); } /* --------------------------------------------------------------------------- * i_frame_index: frame number in file */ static FD *SearchFrameDistortionArray(DL *omcplist, int i_frame_index, int StepLength, int IntraPeriod) { FD *NewFD = NULL; int keyframenum = i_frame_index / StepLength; int subframenumIndex = i_frame_index % StepLength; int subframenum; int i; if (subframenumIndex == 0) { NewFD = &omcplist->FrameDistortionArray[keyframenum - 1]; } if (subframenumIndex != 0 && IntraPeriod == 0) { NewFD = &omcplist->FrameDistortionArray[keyframenum].subFrameDistortionArray[subframenumIndex - 1]; } if (subframenumIndex != 0 && IntraPeriod != 0) { for (i = 0; i < StepLength - 1; i++) { subframenum = omcplist->FrameDistortionArray[keyframenum].subFrameDistortionArray[i].FrameNumber; if (subframenum == i_frame_index) { NewFD = &omcplist->FrameDistortionArray[keyframenum].subFrameDistortionArray[i]; break; } } } return NewFD; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ int tdrdo_get_buffer_size(xavs2_param_t *param) { int StepLength = param->num_bframes == 0 ? 
1 : param->i_gop_size; int num_frames = 0; int size_blocks; if (param->enable_tdrdo) { if (!param->num_bframes) { num_frames += (param->num_frames / StepLength + 1); num_frames += (param->num_frames / StepLength + 1); } else { num_frames += (param->num_frames - 1) / StepLength + 1; num_frames += param->num_frames + 1; } } size_blocks = 5 * sizeof(double) * (int)ceil(1.0 * param->org_width / WORKBLOCKSIZE) * (int)ceil(1.0 * param->org_height / WORKBLOCKSIZE); return sizeof(td_rdo_t) + num_frames * sizeof(FD) + size_blocks; } /* --------------------------------------------------------------------------- */ int tdrdo_init(td_rdo_t *td_rdo, xavs2_param_t *param) { uint8_t *mem_ptr = (uint8_t *)td_rdo; uint8_t *mem_start = mem_ptr; int size_buffer = tdrdo_get_buffer_size(param); int num_blocks = (int)ceil(1.0 * param->org_width / WORKBLOCKSIZE) * (int)ceil(1.0 * param->org_height / WORKBLOCKSIZE); int i; if (param->num_bframes != 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "TDRDO cannot be used in RA configuration.\n"); return -1; } /* memory alloc */ memset(td_rdo, 0, size_buffer); mem_ptr += sizeof(td_rdo_t); td_rdo->KappaTable = (double *)mem_ptr; mem_ptr += sizeof(double) * num_blocks; td_rdo->StepLength = param->num_bframes == 0 ? 1 : param->i_gop_size; if (!param->num_bframes) { td_rdo->OMCPDList.FrameDistortionArray = (FD *)mem_ptr; mem_ptr += (param->num_frames / td_rdo->StepLength + 1) * sizeof(FD); CreatDistortionList(&td_rdo->OMCPDList, param->num_frames / td_rdo->StepLength + 1, param->org_width, param->org_height, WORKBLOCKSIZE, 1 << param->lcu_bit_level); td_rdo->RealDList.FrameDistortionArray = (FD *)mem_ptr; mem_ptr += (param->num_frames / td_rdo->StepLength + 1) * sizeof(FD); CreatDistortionList(&td_rdo->RealDList, param->num_frames / td_rdo->StepLength + 1, param->org_width, param->org_height, WORKBLOCKSIZE, 1 << param->lcu_bit_level); } else { td_rdo->OMCPDList.FrameDistortionArray = (FD *)mem_ptr; mem_ptr += ((param->num_frames - 1) / td_rdo->StepLength + 1) * sizeof(FD); CreatDistortionList(&td_rdo->OMCPDList, (param->num_frames - 1) / td_rdo->StepLength + 1, param->org_width, param->org_height, WORKBLOCKSIZE, 1 << param->lcu_bit_level); td_rdo->RealDList.FrameDistortionArray = (FD *)mem_ptr; mem_ptr += (param->num_frames + 1) * sizeof(FD); CreatDistortionList(&td_rdo->RealDList, param->num_frames + 1, param->org_width, param->org_height, WORKBLOCKSIZE, 1 << param->lcu_bit_level); } td_rdo->porgF.FrameWidth = param->org_width; td_rdo->porgF.FrameHeight = param->org_height; memcpy(&td_rdo->ppreF, &td_rdo->porgF, sizeof(Frame)); memcpy(&td_rdo->precF, &td_rdo->porgF, sizeof(Frame)); /* copy of QP offset */ for (i = 0; i < param->i_gop_size; i++) { td_rdo->QpOffset[i] = param->cfg_ref_all[i].qp_offset; } td_rdo->D = (double *)mem_ptr; mem_ptr += num_blocks * sizeof(double); td_rdo->DMCP = (double *)mem_ptr; mem_ptr += num_blocks * sizeof(double); td_rdo->BetaTable = (double *)mem_ptr; mem_ptr += num_blocks * sizeof(double); td_rdo->MultiplyBetas = (double *)mem_ptr; mem_ptr += num_blocks * sizeof(double); if (mem_ptr - mem_start <= size_buffer) { return 0; } else { xavs2_log(NULL, XAVS2_LOG_ERROR, "TDRDO init error detected.\n"); return -1; } } /* --------------------------------------------------------------------------- */ void tdrdo_destroy(td_rdo_t *td_rdo) { UNUSED_PARAMETER(td_rdo); } /* --------------------------------------------------------------------------- */ void tdrdo_frame_start(xavs2_t *h) { td_rdo_t *td_rdo = h->td_rdo; assert(td_rdo != NULL); 
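    /* Per-frame TDRDO setup: pick and allocate the block-distortion record for
     * this frame, estimate the motion-compensated distortion between the
     * previous and the current original luma planes (only for frames at
     * StepLength boundaries), and refresh the Kappa table that
     * tdrdo_lcu_adjust_lambda() later uses to scale the LCU-level lambda. */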
td_rdo->GlobeFrameNumber = h->ip_pic_idx; if (h->param->num_bframes) { td_rdo->pRealFD = &td_rdo->RealDList.FrameDistortionArray[td_rdo->GlobeFrameNumber]; } else { td_rdo->pRealFD = &td_rdo->RealDList.FrameDistortionArray[td_rdo->globenumber]; } td_rdo->pRealFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pRealFD->TotalNumOfBlocks, sizeof(BD)); if (td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0) { if (h->fenc->i_frame == 0) { td_rdo->porgF.Y_base = h->fenc->planes[IMG_Y]; td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; td_rdo->ppreF.Y_base = h->img_luma_pre->planes[IMG_Y]; td_rdo->ppreF.nStrideY = h->img_luma_pre->i_stride[IMG_Y]; xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); } else if ((int)h->fenc->i_frame < h->param->num_frames) { td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[td_rdo->GlobeFrameNumber - 1]; td_rdo->pOMCPFD->BlockDistortionArray = (BD *)xavs2_calloc(td_rdo->pOMCPFD->TotalNumOfBlocks, sizeof(BD)); td_rdo->porgF.Y_base = h->fenc->planes[IMG_Y]; td_rdo->porgF.nStrideY = h->fenc->i_stride[IMG_Y]; MotionDistortion(td_rdo->pOMCPFD, &td_rdo->ppreF, &td_rdo->porgF, SEARCHRANGE); xavs2_frame_copy_planes(h, h->img_luma_pre, h->fenc); } td_rdo->pOMCPFD = NULL; } if (td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0 && td_rdo->GlobeFrameNumber < h->param->num_frames - 1) { CaculateKappaTableLDP(h, &td_rdo->OMCPDList, &td_rdo->RealDList, td_rdo->GlobeFrameNumber, h->i_qp); } } /* --------------------------------------------------------------------------- */ void tdrdo_frame_done(xavs2_t *h) { FD *pDelFD; int DelFDNumber; td_rdo_t *td_rdo = h->td_rdo; assert(td_rdo != NULL); if ((h->fenc->i_frame % td_rdo->StepLength == 0 && !h->param->num_bframes) || h->param->num_bframes) { td_rdo->precF.Y_base = h->fdec->planes[IMG_Y]; //td_rdo->precF.nStrideY = h->fdec->i_stride[IMG_Y];// fdec->stride[0] , bitrate rise ? td_rdo->precF.nStrideY = h->img_luma_pre->i_stride[IMG_Y]; //to check: fdec->stride[0] ? 
by lutao MotionDistortion(td_rdo->pRealFD, &td_rdo->porgF, &td_rdo->precF, 0); } td_rdo->pRealFD->FrameNumber = h->fenc->i_frame; td_rdo->globenumber++; DelFDNumber = td_rdo->globenumber - td_rdo->StepLength - 1; if (DelFDNumber >= 0) { pDelFD = &td_rdo->RealDList.FrameDistortionArray[DelFDNumber]; if (pDelFD->BlockDistortionArray != NULL) { xavs2_free(pDelFD->BlockDistortionArray); } pDelFD->BlockDistortionArray = NULL; } if (h->fenc->i_frame % td_rdo->StepLength == 0) { DelFDNumber = h->fenc->i_frame / td_rdo->StepLength - 2; if (DelFDNumber >= 0) { pDelFD = &td_rdo->OMCPDList.FrameDistortionArray[DelFDNumber]; if (pDelFD->BlockDistortionArray != NULL) { xavs2_free(pDelFD->BlockDistortionArray); } pDelFD->BlockDistortionArray = NULL; if (pDelFD->subFrameDistortionArray != NULL) { xavs2_free(pDelFD->subFrameDistortionArray); } pDelFD->subFrameDistortionArray = NULL; } } } /* --------------------------------------------------------------------------- */ void tdrdo_lcu_adjust_lambda(xavs2_t *h, rdcost_t *new_lambda) { td_rdo_t *td_rdo = h->td_rdo; assert(td_rdo != NULL); td_rdo->CurMBQP = h->i_qp; if (td_rdo->GlobeFrameNumber < h->param->num_frames && h->i_type != SLICE_TYPE_I) { if (h->param->num_bframes && h->param->num_frames > 1 && td_rdo->GlobeFrameNumber <= ((int)((h->param->num_frames - 1) / td_rdo->StepLength))*td_rdo->StepLength) { td_rdo->pOMCPFD = SearchFrameDistortionArray(&td_rdo->OMCPDList, td_rdo->GlobeFrameNumber, td_rdo->StepLength, h->i_type); } else if (!h->param->num_bframes && h->param->num_frames > td_rdo->StepLength && td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0) { td_rdo->pOMCPFD = &td_rdo->OMCPDList.FrameDistortionArray[(td_rdo->GlobeFrameNumber - 1) / td_rdo->StepLength]; } else { td_rdo->pOMCPFD = NULL; } } // Just for LDP if (h->i_type != SLICE_TYPE_I && h->param->num_bframes == 0) { AdjustLcuQPLambdaLDP(h, td_rdo->pOMCPFD, h->lcu.i_scu_xy, h->i_width_in_mincu, new_lambda); td_rdo->CurMBQP = XAVS2_CLIP3F(MIN_QP, MAX_QP, td_rdo->CurMBQP); } } /* --------------------------------------------------------------------------- */ void tdrdo_lcu_update(xavs2_t *h) { td_rdo_t *td_rdo = h->td_rdo; assert(td_rdo != NULL); if ((td_rdo->GlobeFrameNumber % td_rdo->StepLength == 0 && !h->param->num_bframes) || h->param->num_bframes) { // stores for key frame StoreLCUInf(td_rdo->pRealFD, h->lcu.i_scu_xy, h->param->org_width / MIN_CU_SIZE, td_rdo->CurMBQP, h->f_lambda_mode, h->i_type); } } xavs2-1.3/source/encoder/tdrdo.h000066400000000000000000000045511340660520300166300ustar00rootroot00000000000000/* * tdrdo.h * * Description of this file: * TDRDO functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_TDRDO_H #define XAVS2_TDRDO_H /** * =========================================================================== * function declares * =========================================================================== */ #define tdrdo_get_buffer_size FPFX(tdrdo_get_buffer_size) int tdrdo_get_buffer_size(xavs2_param_t *param); #define tdrdo_init FPFX(tdrdo_init) int tdrdo_init(td_rdo_t *td_rdo, xavs2_param_t *param); #define tdrdo_destroy FPFX(tdrdo_destroy) void tdrdo_destroy(td_rdo_t *td_rdo); #define tdrdo_frame_start FPFX(tdrdo_frame_start) void tdrdo_frame_start(xavs2_t *h); #define tdrdo_frame_done FPFX(tdrdo_frame_done) void tdrdo_frame_done(xavs2_t *h); #define tdrdo_lcu_adjust_lambda FPFX(tdrdo_lcu_adjust_lambda) void tdrdo_lcu_adjust_lambda(xavs2_t *h, rdcost_t *new_lambda); #define tdrdo_lcu_update FPFX(tdrdo_lcu_update) void tdrdo_lcu_update(xavs2_t *h); #endif // XAVS2_TDRDO_H xavs2-1.3/source/encoder/wquant.c000066400000000000000000000413361340660520300170300ustar00rootroot00000000000000/* * wquant.c * * Description of this file: * Weighted Quant functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "wquant.h" #if ENABLE_WQUANT /** * =========================================================================== * global/local tables * =========================================================================== */ /* --------------------------------------------------------------------------- */ const short tab_wq_param_default[2][6] = { { 67, 71, 71, 80, 80, 106}, { 64, 49, 53, 58, 58, 64 } }; /* --------------------------------------------------------------------------- */ static const int tab_WqMDefault4x4[16] = { 64, 64, 64, 68, 64, 64, 68, 72, 64, 68, 76, 80, 72, 76, 84, 96 }; /* --------------------------------------------------------------------------- */ static const int tab_WqMDefault8x8[64] = { 64, 64, 64, 64, 68, 68, 72, 76, 64, 64, 64, 68, 72, 76, 84, 92, 64, 64, 68, 72, 76, 80, 88, 100, 64, 68, 72, 80, 84, 92, 100, 28, 68, 72, 80, 84, 92, 104, 112, 128, 76, 80, 84, 92, 104, 116, 132, 152, 96, 100, 104, 116, 124, 140, 164, 188, 104, 108, 116, 128, 152, 172, 192, 216 }; /* --------------------------------------------------------------------------- * weight quant model for */ static const uint8_t tab_WeightQuantModel[4][64] = { // l a b c d h // 0 1 2 3 4 5 { 0, 0, 0, 4, 4, 4, 5, 5, // Mode 0 0, 0, 3, 3, 3, 3, 5, 5, 0, 3, 2, 2, 1, 1, 5, 5, 4, 3, 2, 2, 1, 5, 5, 5, 4, 3, 1, 1, 5, 5, 5, 5, 4, 3, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, { 0, 0, 0, 4, 4, 4, 5, 5, // Mode 1 0, 0, 4, 4, 4, 4, 5, 5, 0, 3, 2, 2, 2, 1, 5, 5, 3, 3, 2, 2, 1, 5, 5, 5, 3, 3, 2, 1, 5, 5, 5, 5, 3, 3, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, { 0, 0, 0, 4, 4, 3, 5, 5, // Mode 2 0, 0, 4, 4, 3, 2, 5, 5, 0, 4, 4, 3, 2, 1, 5, 5, 4, 4, 3, 2, 1, 5, 5, 5, 4, 3, 2, 1, 5, 5, 5, 5, 3, 2, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, { 0, 0, 0, 3, 2, 1, 5, 5, // Mode 3 0, 0, 4, 3, 2, 1, 5, 5, 0, 4, 4, 3, 2, 1, 5, 5, 3, 3, 3, 3, 2, 5, 5, 5, 2, 2, 2, 2, 5, 5, 5, 5, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 } }; /* --------------------------------------------------------------------------- */ static const uint8_t tab_WeightQuantModel4x4[4][16] = { // l a b c d h // 0 1 2 3 4 5 { 0, 4, 3, 5, // Mode 0 4, 2, 1, 5, 3, 1, 1, 5, 5, 5, 5, 5 }, { 0, 4, 4, 5, // Mode 1 3, 2, 2, 5, 3, 2, 1, 5, 5, 5, 5, 5 }, { 0, 4, 3, 5, // Mode 2 4, 3, 2, 5, 3, 2, 1, 5, 5, 5, 5, 5 }, { 0, 3, 1, 5, // Mode 3 3, 4, 2, 5, 1, 2, 2, 5, 5, 5, 5, 5 } }; /* --------------------------------------------------------------------------- */ static const char *tab_WQMType[2] = { "WQM_4X4", "WQM_8X8", }; /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void wq_get_default_matrix(int wqm_idx, int *src) { int wqm_size = 1 << (wqm_idx + 2); int i; if (wqm_idx == 0) { for (i = 0; i < wqm_size * wqm_size; i++) { src[i] = tab_WqMDefault4x4[i]; } } else if (wqm_idx == 1) { for (i = 0; i < wqm_size * wqm_size; i++) { src[i] = tab_WqMDefault8x8[i]; } } } /* --------------------------------------------------------------------------- */ static void wq_get_user_defined_matrix(char* wqm_file, int wqm_idx, int *src) { char line[1024]; char *ret; FILE *fp; int x, y, coef, wqm_size; if ((fp = fopen(wqm_file, "r")) == (FILE*)NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Can't open file %s.\n %s.\n", wqm_file); 
exit(303); } fseek(fp, 0L, SEEK_SET); do { ret = fgets(line, 1024, fp); if ((ret == NULL) || (strstr(line, tab_WQMType[wqm_idx]) == NULL && feof(fp))) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Error: can't read matrix %s.\n", tab_WQMType[wqm_idx]); exit(304); } } while (strstr(line, tab_WQMType[wqm_idx]) == NULL); wqm_size = 1 << (wqm_idx + 2); for (y = 0; y < wqm_size; y++) { for (x = 0; x < wqm_size; x++) { fscanf(fp, "%d,", &coef); if ((coef == 0) || coef > 255) { xavs2_log(NULL, XAVS2_LOG_ERROR, "QM coefficients %d is not in the range of [1, 255].\n", coef); exit(305); } else { src[y * wqm_size + x] = coef; } } } fclose(fp); } /* --------------------------------------------------------------------------- * calculate the level scale matrix from the current frequency weighting matrix * wqm_idx, 0: 4x4 1:8x8 2: 16x16 3:32x32 */ static void wq_calculate_quant_param(xavs2_t *h, int wqm_idx) { wq_data_t *wq = &h->wq_data; int *LevelScaleNxN[2] = { NULL, NULL }; int block_size = 1 << (wqm_idx + 2); int i, j; LevelScaleNxN[0] = wq->levelScale[wqm_idx][0]; LevelScaleNxN[1] = wq->levelScale[wqm_idx][1]; if (h->WeightQuantEnable) { for (j = 0; j < block_size; j++) { for (i = 0; i < block_size; i++) { if ((wqm_idx == 0) || (wqm_idx == 1)) { LevelScaleNxN[1][j * block_size + i] = (int)((float)(32768 << 7) / wq->cur_wq_matrix[wqm_idx][j * block_size + i]); } else if (wqm_idx == 2) { LevelScaleNxN[1][j * block_size + i] = (int)((float)(32768 << 7) / wq->cur_wq_matrix[wqm_idx][(j >> 1) * (block_size >> 1) + (i >> 1)]); } else if (wqm_idx == 3) { LevelScaleNxN[1][j * block_size + i] = (int)((float)(32768 << 7) / wq->cur_wq_matrix[wqm_idx][(j >> 2) * (block_size >> 2) + (i >> 2)]); } } } } else { for (j = 0; j < block_size; j++) { for (i = 0; i < block_size; i++) { LevelScaleNxN[0][j * block_size + i] = 32768; LevelScaleNxN[1][j * block_size + i] = 32768; } } } } /* --------------------------------------------------------------------------- * read user-defined frequency weighting parameters from configure file * Input: str_param, input parameters string * mode, =0 load string to the UnDetailed parameters * =1 load string to the Detailed parameters */ static void wq_get_user_defined_param(xavs2_t *h, char *str_param, int mode) { char str[WQMODEL_PARAM_SIZE]; char *p; int param = 0; int num = 0; if (strlen(str_param) > WQMODEL_PARAM_SIZE) { xavs2_log(h, XAVS2_LOG_ERROR, "Cannot read the weight parameters in configuration file %s.\n", str_param); exit(301); } strcpy(str, str_param); p = str; for (;;) { if (*p == '[') { p++; param = 0; continue; } else if ((*p >= '0') && (*p <= '9')) { param = param * 10 + (*p - '0'); } else if ((*p == ',') || (*p == ' ')) { h->wq_data.wq_param[mode][num] = (int16_t)param; num++; param = 0; } if (*p != ']') { p++; } else { h->wq_data.wq_param[mode][num] = (int16_t)param; num++; break; } } if (num != PARAM_NUM) { xavs2_log(h, XAVS2_LOG_ERROR, "Not all of the weight parameters is loaded in configuration file %s.\n", str_param); exit(302); } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * initializes the frequency weighting parameters for a new frame */ void xavs2_wq_init_seq_quant_param(xavs2_t *h) { int seq_wqm[64]; wq_data_t *wq = &h->wq_data; int wqm_index, wqm_idx; int block_size; int i; wq->levelScale[0][0] = wq->LevelScale4x4[0]; wq->levelScale[0][1] = 
wq->LevelScale4x4[1]; wq->levelScale[1][0] = wq->LevelScale8x8[0]; wq->levelScale[1][1] = wq->LevelScale8x8[1]; wq->levelScale[2][0] = wq->LevelScale16x16[0]; wq->levelScale[2][1] = wq->LevelScale16x16[1]; wq->levelScale[3][0] = wq->LevelScale32x32[0]; wq->levelScale[3][1] = wq->LevelScale32x32[1]; for (wqm_index = 0; wqm_index < 4; wqm_index++) { for (i = 0; i < 64; i++) { wq->cur_wq_matrix[wqm_index][i] = 1 << 4; } } for (wqm_index = 0; wqm_index < 2; wqm_index++) { block_size = XAVS2_MIN(1 << (wqm_index + 2), 8); wqm_idx = (wqm_index < 2) ? wqm_index : 1; if (h->param->SeqWQM == 0) { wq_get_default_matrix(wqm_idx, seq_wqm); } else if (h->param->SeqWQM == 1) { wq_get_user_defined_matrix(h->param->psz_seq_wq_file, wqm_idx, seq_wqm); } for (i = 0; i < (block_size * block_size); i++) { wq->seq_wq_matrix[wqm_index][i] = (int16_t)seq_wqm[i]; } } } /* --------------------------------------------------------------------------- * initializes the frequency weighting parameters for a new picture */ void xavs2_wq_init_pic_quant_param(xavs2_t *h) { int pic_wqm[64]; wq_data_t *wq = &h->wq_data; int wqm_index, block_size, wqm_idx; int wq_model; int i, j, k; h->WeightQuantEnable = (h->param->enable_wquant && h->param->PicWQEnable); if (!h->WeightQuantEnable) { for (i = 0; i < 2; i++) { for (j = 0; j < 6; j++) { wq->wq_param[i][j] = 128; } } for (wqm_index = 0; wqm_index < 2; wqm_index++) { block_size = 1 << (wqm_index + 2); for (k = 0; k < 2; k++) { for (j = 0; j < block_size; j++) { for (i = 0; i < block_size; i++) { wq->wq_matrix[wqm_index][k][j * block_size + i] = 1 << 7; } } } } } else { if (h->param->PicWQDataIndex == 1) { // patch the weighting parameters use default weighted parameters, input->WQParam==0 for (i = 0; i < 2; i++) { for (j = 0; j < 6; j++) { wq->wq_param[i][j] = 128; } } // if input->WQParam!=0, update wq_param if (h->param->WQParam == 0) { wq->cur_frame_wq_param = FRAME_WQ_DEFAULT; // default param - detailed for (i = 0; i < 6; i++) { wq->wq_param[DETAILED][i] = tab_wq_param_default[DETAILED][i]; } } else if (h->param->WQParam == 1) { // load user defined weighted parameters wq->cur_frame_wq_param = USER_DEF_UNDETAILED; // user defined undetailed param wq_get_user_defined_param(h, h->param->WeightParamUnDetailed, 0); } else if (h->param->WQParam == 2) { // load user defined weighted parameters wq->cur_frame_wq_param = USER_DEF_DETAILED; // user defined detailed param wq_get_user_defined_param(h, h->param->WeightParamDetailed, 1); } // reconstruct the weighting matrix wq_model = h->param->WQModel; for (k = 0; k < 2; k++) { for (j = 0; j < 8; j++) { for (i = 0; i < 8; i++) { wq->wq_matrix[1][k][j * 8 + i] = (wq->wq_param[k][tab_WeightQuantModel[wq_model][j * 8 + i]]); } } } for (k = 0; k < 2; k++) { for (j = 0; j < 4; j++) { for (i = 0; i < 4; i++) { wq->wq_matrix[0][k][j * 4 + i] = (wq->wq_param[k][tab_WeightQuantModel4x4[wq_model][j * 4 + i]]); } } } } else if (h->param->PicWQDataIndex == 2) { for (wqm_index = 0; wqm_index < 2; wqm_index++) { block_size = XAVS2_MIN(1 << (wqm_index + 2), 8); wqm_idx = (wqm_index < 2) ? 
wqm_index : 1; wq_get_user_defined_matrix(h->param->psz_pic_wq_file, wqm_idx, pic_wqm); for (i = 0; i < (block_size * block_size); i++) { wq->pic_user_wq_matrix[wqm_index][i] = (int16_t)pic_wqm[i]; } } } } for (wqm_index = 0; wqm_index < 4; wqm_index++) { for (i = 0; i < 64; i++) { wq->cur_wq_matrix[wqm_index][i] = 1 << 7; } } } /* --------------------------------------------------------------------------- * update the frequency weighting matrix for current picture */ void xavs2_wq_update_pic_matrix(xavs2_t *h) { wq_data_t *wq = &h->wq_data; int wqm_index, wqm_idx; int block_size; int i; if (h->WeightQuantEnable) { for (wqm_index = 0; wqm_index < 4; wqm_index++) { block_size = XAVS2_MIN(1 << (wqm_index + 2), 8); wqm_idx = (wqm_index < 2) ? wqm_index : 1; if (h->param->PicWQDataIndex == 0) { for (i = 0; i < (block_size * block_size); i++) { wq->cur_wq_matrix[wqm_index][i] = wq->seq_wq_matrix[wqm_idx][i]; } } else if (h->param->PicWQDataIndex == 1) { if (h->param->WQParam == 0) { for (i = 0; i < (block_size * block_size); i++) { wq->cur_wq_matrix[wqm_index][i] = wq->wq_matrix[wqm_idx][DETAILED][i]; } } else if (h->param->WQParam == 1) { for (i = 0; i < (block_size * block_size); i++) { wq->cur_wq_matrix[wqm_index][i] = wq->wq_matrix[wqm_idx][0][i]; } } else if (h->param->WQParam == 2) { for (i = 0; i < (block_size * block_size); i++) { wq->cur_wq_matrix[wqm_index][i] = wq->wq_matrix[wqm_idx][1][i]; } } } else if (h->param->PicWQDataIndex == 2) { for (i = 0; i < (block_size * block_size); i++) { wq->cur_wq_matrix[wqm_index][i] = wq->pic_user_wq_matrix[wqm_idx][i]; } } } } for (wqm_index = 0; wqm_index < 4; wqm_index++) { wq_calculate_quant_param(h, wqm_index); } } #endif // ENABLE_WQUANT xavs2-1.3/source/encoder/wquant.h000066400000000000000000000047461340660520300170410ustar00rootroot00000000000000/* * wquant.h * * Description of this file: * Weighted Quant functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_WQUANT_H #define XAVS2_WQUANT_H #if ENABLE_WQUANT #define PARAM_NUM 6 #define WQ_MODEL_NUM 3 #define SCENE_MODE_NUM 4 #define UNDETAILED 0 #define DETAILED 1 #define WQ_MODE_F 0 #define WQ_MODE_U 1 #define WQ_MODE_D 2 #define FRAME_WQ_DEFAULT 0 #define USER_DEF_UNDETAILED 1 #define USER_DEF_DETAILED 2 /** * =========================================================================== * interface function declares * =========================================================================== */ #define xavs2_wq_init_seq_quant_param FPFX(wq_init_seq_quant_param) void xavs2_wq_init_seq_quant_param(xavs2_t *h); #define xavs2_wq_init_pic_quant_param FPFX(wq_init_pic_quant_param) void xavs2_wq_init_pic_quant_param(xavs2_t *h); #define xavs2_wq_update_pic_matrix FPFX(wq_update_pic_matrix) void xavs2_wq_update_pic_matrix(xavs2_t *h); extern const short tab_wq_param_default[2][6]; #endif #endif // XAVS2_WQUANT_H xavs2-1.3/source/encoder/wrapper.c000066400000000000000000000126351340660520300171710ustar00rootroot00000000000000/* * wrapper.c * * Description of this file: * encoder wrapper functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "wrapper.h" #include "frame.h" #include "encoder.h" #include "rps.h" /* --------------------------------------------------------------------------- */ void frame_buffer_init(xavs2_handler_t *h_mgr, uint8_t **mem_base, xavs2_frame_buffer_t *frm_buf, int num_frm, int frm_type) { int i; memset(frm_buf, 0, sizeof(xavs2_frame_buffer_t)); frm_buf->COI = 0; frm_buf->COI_IDR = 0; frm_buf->POC_IDR = 0; frm_buf->num_frames = num_frm; frm_buf->i_frame_b = 0; frm_buf->ip_pic_idx = 0; if (mem_base == NULL) { for (i = 0; i < num_frm; i++) { frm_buf->frames[i] = xavs2_frame_new(h_mgr->p_coder, NULL, frm_type); } } else { uint8_t *mem_ptr = *mem_base; for (i = 0; i < num_frm; i++) { frm_buf->frames[i] = xavs2_frame_new(h_mgr->p_coder, &mem_ptr, frm_type); ALIGN_POINTER(mem_ptr); } *mem_base = mem_ptr; } } /* --------------------------------------------------------------------------- * destroy frame buffer */ void frame_buffer_destroy(xavs2_handler_t *h_mgr, xavs2_frame_buffer_t *frm_buf) { int i; for (i = 0; i < frm_buf->num_frames; i++) { xavs2_frame_delete(h_mgr, frm_buf->frames[i]); frm_buf->frames[i] = NULL; } } /* --------------------------------------------------------------------------- * update frame buffer information */ void frame_buffer_update(xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *frm) { /* update the task manager */ if (h->param->intra_period_max != 0 && frm->i_frm_type == XAVS2_TYPE_I) { frm_buf->COI_IDR = frm->i_frm_coi; frm_buf->POC_IDR = frm->i_frame; } if (frm->i_frm_type == XAVS2_TYPE_B) { frm_buf->i_frame_b++; /* encoded B-picture index */ } else { frm_buf->i_frame_b = 0; /* reset */ frm_buf->ip_pic_idx++; /* encoded I/P/F-picture index */ } } /** * --------------------------------------------------------------------------- * Function : destroy all lists used by the AVS video encoder * Parameters : * [in ] : h_mgr - pointer of struct xavs2_handler_t, the AVS encoder * [out] : none * Return : none * --------------------------------------------------------------------------- */ void destroy_all_lists(xavs2_handler_t *h_mgr) { int i; assert(h_mgr != NULL); xl_destroy(&h_mgr->list_frames_output); xl_destroy(&h_mgr->list_frames_ready); xl_destroy(&h_mgr->list_frames_free); for (i = 0; i < XAVS2_INPUT_NUM; i++) { xavs2_frame_destroy_objects(h_mgr, h_mgr->ipb.frames[i]); } } /** * --------------------------------------------------------------------------- * Function : proceeding of wrapper thread * Parameters : * [in ] : h_mgr - pointer to xavs2_handler_t * [out] : none * Return : none * --------------------------------------------------------------------------- */ void *proc_wrapper_thread(void *args) { xavs2_handler_t *h_mgr = (xavs2_handler_t *)args; xlist_t *list_in = &h_mgr->list_frames_ready; xlist_t *list_idle = &h_mgr->list_frames_free; for (;;) { /* fetch one node from input list */ xavs2_frame_t *frame = (xavs2_frame_t *)xl_remove_head(list_in, 1); int state = frame->i_state; if (state == XAVS2_EXIT_THREAD) { xl_append(list_idle, frame); break; /* exit this thread */ } /* encoding... 
*/ if (encoder_encode(h_mgr, frame) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "encode frame fail\n"); break; /* exit on error */ } /* throw it into idle list */ if (state == XAVS2_FLUSH) { xl_append(list_idle, frame); } } return NULL; } xavs2-1.3/source/encoder/wrapper.h000066400000000000000000000363731340660520300172030ustar00rootroot00000000000000/* * wrapper.h * * Description of this file: * encoder wrapper functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_WRAPPER_H #define XAVS2_WRAPPER_H #include "xlist.h" #include "threadpool.h" /** * =========================================================================== * type defines * =========================================================================== */ // function type typedef void(*vpp_ipred_t)(pel_t *p_pred, pel_t *p_top, pel_t *p_left); /* --------------------------------------------------------------------------- * lookahead_t */ typedef struct lookahead_t { int start; int bpframes; int gopframes; } lookahead_t; /* --------------------------------------------------------------------------- * low resolution of frame (luma plane) */ typedef struct frm_lowres_t { int i_width; /* width for luma plane */ int i_lines; /* height for luma plane */ int i_stride; /* stride for luma plane */ pel_t *filtered; /* half-size copy of input frame (luma only) */ } frm_lowres_t; /* --------------------------------------------------------------------------- * video pre-processing motion estimation */ typedef struct vpp_me_t { int mv_min[2]; /* full pel MV range for motion search (min) */ int mv_max[2]; /* full pel MV range for motion search (max) */ mv_t bmv; /* [OUT] best motion vector */ mv_t pmv; /* pred motion vector for the current block */ uint16_t *mvbits; /* used for getting the mv bits */ pixel_cmp_t sad_8x8; /* function handle for cal sad of 8x8 block */ pixel_cmp_x3_t sad_8x8_x3; /* function handle for cal sad of 8x8 block (X3) */ pixel_cmp_x4_t sad_8x8_x4; /* function handle for cal sad of 8x8 block (X4) */ } vpp_me_t; /* --------------------------------------------------------------------------- * frame buffer manager */ struct xavs2_frame_buffer_t { xavs2_frame_t *frames[FREF_BUF_SIZE]; /* all managed pictures */ int num_frames; /* number of managed pictures */ int COI; /* Coding Order Index */ int COI_IDR; /* COI of current IDR frame */ int POC_IDR; 
/* POC of current IDR frame */ int ip_pic_idx; /* encoded I/P/F-picture index (to be REMOVED) */ int i_frame_b; /* number of encoded B-picture in a GOP */ /* frames to be removed before next frame encoding */ int num_frames_to_remove; /* number of frames to be removed */ int coi_remove_frame[8]; /* COI of frames to be removed */ }; /* --------------------------------------------------------------------------- * xavs2_handler_t */ struct xavs2_handler_t { ALIGN32(xavs2_log_t module_log); /* used for logging */ /* encoder engines */ xavs2_t *p_coder; /* point to the xavs2 video encoder */ xavs2_t *frm_contexts[MAX_PARALLEL_FRAMES]; /* frame task contexts */ xavs2_t *row_contexts; /* row task contexts */ /* frame buffers */ xavs2_frame_buffer_t ipb; /* input picture buffer */ xavs2_frame_buffer_t dpb; /* decoding picture buffer */ /* properties */ int64_t max_out_pts; /* max output pts */ int64_t max_out_dts; /* max output dts */ /* number of frames */ int num_input; /* number of frames: input into the encoder */ int num_encode; /* number of frames: sent into encoding queue */ int num_output; /* number of frames: outputted */ int b_seq_end; /* has all frames been output */ /* output frame index, use get_next_frame_id() to get next output index */ int i_input; /* index of frames: input already accepted, used for frame output () */ int i_output; /* index of frames: output already encoded , used for frame output () */ /* index of frames, [0, i_frm_threads), to determine frame order */ int i_frame_in; /* frame order [0, i_frm_threads): next input */ int i_frame_aec; /* frame order [0, i_frm_threads): current AEC */ /* threads & synchronization */ volatile int i_exit_flag; /* app signal to exit */ int i_frm_threads; /* real number of thread in frame level parallel */ int i_row_threads; /* real number of thread in LCU-row level parallel */ int num_pool_threads; /* number of threads allocated in threadpool */ int num_row_contexts; /* number of row contexts */ xavs2_threadpool_t *threadpool_rdo; /* the thread pool (for parallel encoding) */ xavs2_threadpool_t *threadpool_aec; /* the thread pool for aec encoding */ xavs2_thread_t thread_wrapper; /* thread for wrapper proceeding */ xavs2_thread_cond_t cond[SIG_COUNT]; xavs2_thread_mutex_t mutex; /* mutex */ /* frames and lists */ xlist_t list_frames_free; /* list[0]: frames which are free to use */ xlist_t list_frames_ready; /* list[1]: frames which are ready for encoding (slice type configured) */ xlist_t list_frames_output; /* list[2]: frames which are ready for output */ /* lookahead and slice type decision */ xavs2_frame_t *blocked_frm_set[XAVS2_MAX_GOP_SIZE + 4]; int64_t blocked_pts_set[XAVS2_MAX_GOP_SIZE + 4]; int64_t prev_reordered_pts_set[XAVS2_MAX_GOP_SIZE + 4]; int num_encoded_frames_for_dts; lookahead_t lookahead; /* lookahead */ int num_blocked_frames; /* number of blocked frames for Slice Type decision */ /* rate-control */ ratectrl_t *rate_control; /* rate control */ td_rdo_t *td_rdo; #if XAVS2_STAT xavs2_stat_t stat; /* stat total */ FILE *fp_trace; /* for trace output */ #endif void *user_data; /* handle of user data */ int64_t create_time; /* time of encoder creation, used for encoding speed test */ #if XAVS2_DUMP_REC FILE *h_rec_file; /* file handle to output reconstructed frame data */ #endif }; /** * =========================================================================== * inline functions * =========================================================================== */ static ALWAYS_INLINE int get_next_frame_id(int idx_cur) { 
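    /* Wrap the running frame index back to 0 once it exceeds MAX_FRAME_INDEX;
     * the wrapper uses this for its i_input / i_output bookkeeping so that
     * frame ids always stay inside a bounded range. */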
idx_cur = (idx_cur + 1); if (idx_cur > MAX_FRAME_INDEX) { return 0; } else { return idx_cur; } } /** * =========================================================================== * interface function declares * =========================================================================== */ /* --------------------------------------------------------------------------- * frame buffer operation */ #define frame_buffer_init FPFX(frame_buffer_init) void frame_buffer_init(xavs2_handler_t *h_mgr, uint8_t **mem_base, xavs2_frame_buffer_t *frm_buf, int num_frm, int frm_type); #define frame_buffer_destroy FPFX(frame_buffer_destroy) void frame_buffer_destroy(xavs2_handler_t *h_mgr, xavs2_frame_buffer_t *frm_buf); #define frame_buffer_update FPFX(frame_buffer_update) void frame_buffer_update(xavs2_t *h, xavs2_frame_buffer_t *frm_buf, xavs2_frame_t *frm); /* --------------------------------------------------------------------------- * wrapper */ #define destroy_all_lists FPFX(destroy_all_lists) void destroy_all_lists(xavs2_handler_t *h_mgr); #define encoder_task_manager_free FPFX(encoder_task_manager_free) void encoder_task_manager_free(xavs2_handler_t *h_mgr); #define proc_wrapper_thread FPFX(proc_wrapper_thread) void *proc_wrapper_thread(void *args); /** * =========================================================================== * API function defines * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : get buffer for the encoder caller * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * : pic - pointer to struct xavs2_picture_t * [out] : none * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic); /** * --------------------------------------------------------------------------- * Function : Output help parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xavs2_encoder_opt_help(void); /** * --------------------------------------------------------------------------- * Function : initialize default parameters for the xavs2 video encoder * Parameters : * [in ] : none * Return : parameter handler, can be further configured * --------------------------------------------------------------------------- */ xavs2_param_t *xavs2_encoder_opt_alloc(void); /** * --------------------------------------------------------------------------- * Function : Parsing encoding parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : argc - number of command line parameters * [in ] : argv - pointer to parameter strings * Return : int - zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_opt_set(xavs2_param_t *param, int argc, char *argv[]); /** * --------------------------------------------------------------------------- * Function : set parameter value * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of parameter * [in ] : value_string - parameter value * Return : int - zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_opt_set2(xavs2_param_t *param, const char *name, const char 
*value_string); /** * --------------------------------------------------------------------------- * Function : get value of a specific parameter * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of a parameter (input, output, width, height, frames) * Return : const char *: value string * --------------------------------------------------------------------------- */ const char *xavs2_encoder_opt_get(xavs2_param_t *param, const char *name); /** * --------------------------------------------------------------------------- * Function : free memory of parameter * Parameters : * [in ] : none * [out] : parameter handler, can be further configured * Return : none * --------------------------------------------------------------------------- */ void xavs2_encoder_opt_destroy(xavs2_param_t *param); /** * =========================================================================== * interface function declares: encoding * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : create and initialize the xavs2 video encoder * Parameters : * [in ] : param - pointer to struct xavs2_param_t * : dump_func - pointer to struct xavs2_dump_func_t * : opaque - user data * [out] : none * Return : handle of xavs2 encoder wrapper, none zero for success, otherwise false * --------------------------------------------------------------------------- */ void *xavs2_encoder_create(xavs2_param_t *param); /** * --------------------------------------------------------------------------- * Function : write (send) data to the xavs2 encoder * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * : pic - pointer to struct xavs2_picture_t * [out] : none * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_encode(void *coder, xavs2_picture_t *pic, xavs2_outpacket_t *packet); /** * --------------------------------------------------------------------------- * Function : label a packet to be recycled * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * : packet - pointer to struct xavs2_outpacket_t * [out] : none * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_packet_unref(void *coder, xavs2_outpacket_t *packet); /** * --------------------------------------------------------------------------- * Function : destroy the xavs2 video encoder * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * [out] : none * Return : none * Note : this API is *NOT* thread-safe, * and can not be called simultaneously with other APIs. * --------------------------------------------------------------------------- */ void xavs2_encoder_destroy(void *coder); #endif // XAVS2_WRAPPER_H xavs2-1.3/source/encoder/xavs2.c000066400000000000000000000555231340660520300165570ustar00rootroot00000000000000/* * xavs2.c * * Description of this file: * API functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "wrapper.h" #include "frame.h" #include "encoder.h" #include "cpu.h" #include "ratecontrol.h" #include "tdrdo.h" #include "presets.h" #include "rps.h" /* --------------------------------------------------------------------------- */ static INLINE int get_num_frame_threads(xavs2_param_t *param, int num_frame_threads, int num_row_threads) { int a = ((param->search_range + (1 << param->lcu_bit_level) - 1) >> param->lcu_bit_level) + 1; int i; if (num_frame_threads > 0 && num_frame_threads < XAVS2_THREAD_MAX) { return num_frame_threads; } for (i = 2; i < XAVS2_THREAD_MAX; i++) { int n_row_threads_need = ((a * (i + 1) - 4) * i) >> 1; if (n_row_threads_need > num_row_threads) { break; } } return i - 1; } /** * =========================================================================== * interface function defines (xavs2 encoder library APIs for AVS2 video encoder) * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : initialize default parameters for the xavs2 video encoder * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [out] : none * Return : none * --------------------------------------------------------------------------- */ xavs2_param_t *xavs2_encoder_opt_alloc(void) { xavs2_param_t *param = (xavs2_param_t *)xavs2_malloc(sizeof(xavs2_param_t)); if (param == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to malloc space for xavs2_param_t with %d bytes\n", sizeof(xavs2_param_t)); return NULL; } memset(param, 0, sizeof(xavs2_param_t)); /* --- sequence --------------------------------------------- */ param->profile_id = MAIN_PROFILE; param->level_id = 66; param->progressive_sequence = 1; param->chroma_format = CHROMA_420; param->input_sample_bit_depth = 8; param->sample_bit_depth = 8; param->sample_precision = 1; param->aspect_ratio_information = 1; param->frame_rate_code = 3; param->lcu_bit_level = MAX_CU_SIZE_IN_BIT; param->scu_bit_level = MIN_CU_SIZE_IN_BIT; param->org_width = 1920; param->org_height = 1080; strcpy(param->psz_in_file, "input.yuv"); strcpy(param->psz_bs_file, "test.avs"); strcpy(param->psz_dump_yuv, ""); #if XAVS2_TRACE strcpy(param->psz_trace_file, "trace_enc.txt"); #endif /* --- stream structure ------------------------------------- */ param->enable_f_frame = TRUE; param->InterlaceCodingOption = 0; param->b_open_gop = 1; param->i_gop_size = -8; param->num_bframes = 7; param->intra_period_max = -1; param->intra_period_min = -1; /* --- picture 
---------------------------------------------- */ param->progressive_frame = 1; param->time_code_flag = 0; param->top_field_first = 0; param->repeat_first_field = 0; param->fixed_picture_qp = TRUE; param->i_initial_qp = 32; /* --- slice ------------------------------------------------ */ param->slice_num = 1; /* --- analysis options ------------------------------------- */ param->enable_hadamard = TRUE; param->me_method = XAVS2_ME_UMH; param->search_range = 64; param->num_max_ref = XAVS2_MAX_REFS; param->inter_2pu = TRUE; param->enable_amp = TRUE; param->enable_intra = TRUE; param->i_rd_level = RDO_ALL; param->preset_level = 5; param->is_preset_configured = FALSE; param->rdo_bit_est_method = 0; /* encoding tools ------------------------------------------- */ param->enable_mhp_skip = FALSE; param->enable_dhp = TRUE; param->enable_wsm = TRUE; param->enable_nsqt = TRUE; param->enable_sdip = TRUE; param->enable_secT = TRUE; param->enable_sao = TRUE; param->b_sao_before_deblock = FALSE; param->enable_alf = TRUE; param->alf_LowLatencyEncoding = FALSE; param->enable_pmvr = TRUE; param->b_cross_slice_loop_filter = FALSE; // Ӱ֡бٶȣĬϽ param->enable_dmh = TRUE; param->b_fast_2lelvel_tu = FALSE; /* RDOQ */ param->i_rdoq_level = RDOQ_ALL; param->lambda_factor_rdoq = 75; param->lambda_factor_rdoq_p = 120; param->lambda_factor_rdoq_b = 100; param->enable_refine_qp = TRUE; param->enable_tdrdo = FALSE; /* loop filter */ param->loop_filter_disable = FALSE; param->loop_filter_parameter_flag = 0; param->alpha_c_offset = 0; param->beta_offset = 0; /* weight quant */ param->enable_wquant = FALSE; #if ENABLE_WQUANT param->SeqWQM = 0; param->PicWQEnable = FALSE; param->PicWQDataIndex = 0; param->MBAdaptQuant = 0; param->chroma_quant_param_delta_u = 0; param->chroma_quant_param_delta_v = 0; param->WQParam = 2; param->WQModel = 1; #endif /* --- rate control ----------------------------------------- */ param->i_rc_method = XAVS2_RC_CQP; param->i_min_qp = 20; param->i_max_qp = MAX_QP; param->i_target_bitrate = 1000000; /* --- parallel --------------------------------------------- */ param->num_parallel_gop = 1; param->i_frame_threads = 0; param->i_lcurow_threads = 0; param->enable_aec_thread = 1; /* --- log -------------------------------------------------- */ param->i_log_level = 3; param->enable_psnr = 1; param->enable_ssim = 0; /* --- input/output for testing ----------------------------- */ param->infile_header = 0; param->output_merged_picture = 0; parse_preset_level(param, param->preset_level); return param; } /** * --------------------------------------------------------------------------- * Function : free memory of parameter * Parameters : * [in ] : none * [out] : parameter handler, can be further configured * Return : none * --------------------------------------------------------------------------- */ void xavs2_encoder_opt_destroy(xavs2_param_t *param) { if (param != NULL) { xavs2_free(param); } } /** * --------------------------------------------------------------------------- * Function : create and initialize the xavs2 video encoder * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [out] : handle of xavs2 encoder wrapper * Return : handle of xavs2 encoder wrapper, none zero for success, otherwise false * --------------------------------------------------------------------------- */ void *xavs2_encoder_create(xavs2_param_t *param) { xavs2_handler_t *h_mgr = NULL; xavs2_frame_t *frm = NULL; uint8_t *mem_ptr = NULL; size_t size_ratecontrol; /* size for rate control module */ size_t 
size_tdrdo; size_t mem_size; int i; if (param == NULL) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Null input parameters for encoder creation\n"); return NULL; } /* confirm the input parameters (log_level) */ if (param->i_log_level < XAVS2_LOG_NONE || param->i_log_level > XAVS2_LOG_DEBUG) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Invalid parameter: log_level %d\n", param->i_log_level); return NULL; } g_xavs2_default_log.i_log_level = param->i_log_level; /* init all function handlers */ memset(&g_funcs, 0, sizeof(g_funcs)); #if HAVE_MMX g_funcs.cpuid = xavs2_cpu_detect(); #endif xavs2_init_all_primitives(param, &g_funcs); /* check parameters */ if (encoder_check_parameters(param) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "error encoder parameters\n"); goto fail; } size_ratecontrol = xavs2_rc_get_buffer_size(param); /* rate control */ size_tdrdo = tdrdo_get_buffer_size(param); /* compute the memory size */ mem_size = sizeof(xavs2_handler_t) + /* M0, size of the encoder wrapper */ xavs2_frame_buffer_size(param, FT_ENC) * XAVS2_INPUT_NUM + /* M4, size of buffered input frames */ size_ratecontrol + /* M5, rate control information */ size_tdrdo + /* M6, TDRDO */ CACHE_LINE_SIZE * (XAVS2_INPUT_NUM + 4); /* alloc memory for the encoder wrapper */ CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size); /* M0: assign the wrapper */ h_mgr = (xavs2_handler_t *)mem_ptr; memset(h_mgr, 0, sizeof(xavs2_handler_t)); mem_ptr += sizeof(xavs2_handler_t); ALIGN_POINTER(mem_ptr); /* init log module */ h_mgr->module_log.i_log_level = param->i_log_level; sprintf(h_mgr->module_log.module_name, "Manager %06llx", (intptr_t)(h_mgr)); /* counter: number of frames */ h_mgr->num_input = 0; h_mgr->num_encode = 0; h_mgr->num_output = 0; /* counters for encoding */ h_mgr->i_exit_flag = 0; h_mgr->i_input = 0; h_mgr->i_output = -1; h_mgr->i_frame_in = 0; h_mgr->i_frame_aec = 0; h_mgr->b_seq_end = 0; h_mgr->max_out_dts = 0; h_mgr->max_out_pts = 0; h_mgr->create_time = xavs2_mdate(); srand((uint32_t)h_mgr->create_time); #if XAVS2_DUMP_REC if (strlen(param->psz_dump_yuv) > 0) { /* open dump file */ if ((h_mgr->h_rec_file = fopen(param->psz_dump_yuv, "wb")) == NULL) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "Error open file %s\n", param->psz_dump_yuv); } } #endif if (xavs2_thread_mutex_init(&h_mgr->mutex, NULL)) { goto fail; } for (i = 0; i < SIG_COUNT; i++) { if (xavs2_thread_cond_init(&h_mgr->cond[i], NULL)) { goto fail; } } /* decide all thread numbers */ h_mgr->i_row_threads = param->i_lcurow_threads == 0 ? xavs2_cpu_num_processors() : param->i_lcurow_threads; h_mgr->i_frm_threads = get_num_frame_threads(param, param->i_frame_threads, h_mgr->i_row_threads); h_mgr->num_pool_threads = 0; h_mgr->num_row_contexts = 0; param->i_lcurow_threads = h_mgr->i_row_threads; param->i_frame_threads = h_mgr->i_frm_threads; /* create RDO thread pool */ if (h_mgr->i_frm_threads > 1 || h_mgr->i_row_threads > 1) { int thread_num = h_mgr->i_frm_threads + h_mgr->i_row_threads; /* total threads */ h_mgr->num_row_contexts = thread_num + h_mgr->i_frm_threads; /* create the thread pool */ if (xavs2_threadpool_init(&h_mgr->threadpool_rdo, thread_num, NULL, NULL)) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "Error init thread pool RDO. 
%d", thread_num); goto fail; } h_mgr->num_pool_threads = thread_num; } /* create AEC thread pool */ h_mgr->threadpool_aec = NULL; if (param->enable_aec_thread) { xavs2_threadpool_init(&h_mgr->threadpool_aec, h_mgr->i_frm_threads, NULL, NULL); } /* init all lists */ if (xl_init(&h_mgr->list_frames_free) != 0 || xl_init(&h_mgr->list_frames_output) != 0 || xl_init(&h_mgr->list_frames_ready) != 0) { goto fail; } /* init rate-control buffer */ ALIGN_POINTER(mem_ptr); h_mgr->rate_control = (ratectrl_t *)mem_ptr; mem_ptr += size_ratecontrol; ALIGN_POINTER(mem_ptr); if (xavs2_rc_init(h_mgr->rate_control, param) < 0) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create rate control fail\n"); goto fail; } /* TD-RDO */ if (param->enable_tdrdo) { h_mgr->td_rdo = (td_rdo_t *)mem_ptr; mem_ptr += size_tdrdo; ALIGN_POINTER(mem_ptr); if (tdrdo_init(h_mgr->td_rdo, param) != 0) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "init td-rdo fail\n"); goto fail; } } /* create an encoder handler */ h_mgr->p_coder = encoder_open(param, h_mgr); if (h_mgr->p_coder == NULL) { goto fail; } /* create encoder handlers for multi-thread */ if (h_mgr->i_frm_threads > 1 || h_mgr->i_row_threads > 1) { if (encoder_contexts_init(h_mgr->p_coder, h_mgr) < 0) { goto fail; } } /* M4: alloc memory for each node and append to image idle list */ frame_buffer_init(h_mgr, &mem_ptr, &h_mgr->ipb, XAVS2_INPUT_NUM, FT_ENC); for (i = 0; i < XAVS2_INPUT_NUM; i++) { frm = h_mgr->ipb.frames[i]; if (frm) { xl_append(&h_mgr->list_frames_free, frm); } else { goto fail; } } /* allocate DPB */ frame_buffer_init(h_mgr, NULL, &h_mgr->dpb, XAVS2_MIN(FREF_BUF_SIZE, MAX_REFS + h_mgr->i_frm_threads * 4), FT_DEC); /* memory check */ if (mem_ptr - (uint8_t *)h_mgr > mem_size) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to create input frame buffer.\n"); goto fail; } /* init lookahead in the encoder wrapper */ h_mgr->lookahead.bpframes = param->i_gop_size; h_mgr->lookahead.start = 0; memset(h_mgr->blocked_frm_set, 0, sizeof(h_mgr->blocked_frm_set)); memset(h_mgr->blocked_pts_set, 0, sizeof(h_mgr->blocked_pts_set)); h_mgr->num_blocked_frames = 0; h_mgr->fp_trace = NULL; /* create wrapper thread */ if (xavs2_create_thread(&h_mgr->thread_wrapper, proc_wrapper_thread, h_mgr)) { xavs2_log(h_mgr, XAVS2_LOG_ERROR, "create encoding thread\n"); goto fail; } return h_mgr; fail: if (mem_ptr && h_mgr) { xavs2_encoder_destroy(h_mgr); } return NULL; } /** * --------------------------------------------------------------------------- * Function : destroy the xavs2 video encoder * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xavs2_encoder_destroy(void *coder) { xavs2_handler_t *h_mgr = (xavs2_handler_t *)coder; xavs2_frame_t frm_flush = { 0 }; xavs2_frame_t frm_exit = { 0 }; /* destroy all threads: lookahead and wrapper threads */ if (h_mgr->p_coder != NULL) { frm_flush.i_state = XAVS2_FLUSH; /* signal to flush encoder */ frm_exit.i_state = XAVS2_EXIT_THREAD; /* signal to exit */ send_frame_to_enc_queue(h_mgr, &frm_flush); send_frame_to_enc_queue(h_mgr, &frm_exit); /* wait until the RDO process exit, then memory can be released */ xavs2_thread_join(h_mgr->thread_wrapper, NULL); } /* close the encoder */ encoder_close(h_mgr); xavs2_log(h_mgr, XAVS2_LOG_DEBUG, "Encoded %d frames, %.3f secs\n", h_mgr->num_input, 0.000001 * (xavs2_mdate() - h_mgr->create_time)); if (h_mgr->fp_trace) { fclose(h_mgr->fp_trace); } /* free memory of encoder wrapper */ 
memset(h_mgr, 0, sizeof(xavs2_handler_t)); xavs2_free(h_mgr); } /** * --------------------------------------------------------------------------- * Function : get buffer for the encoder caller * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * : pic - pointer to struct xavs2_picture_t * [out] : pic - memory would be allocated for the image planes * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_get_buffer(void *coder, xavs2_picture_t *pic) { xavs2_handler_t *h_mgr = (xavs2_handler_t *)coder; xavs2_t *p_coder = h_mgr->p_coder; const xavs2_param_t *param = p_coder->param; xavs2_frame_t *frame; assert(h_mgr != NULL && pic != NULL); if (h_mgr == NULL || pic == NULL) { return -1; } memset(pic, 0, sizeof(xavs2_picture_t)); /* fetch an empty node from unused list */ frame = frame_buffer_get_free_frame_ipb(h_mgr); /* set properties */ pic->img.in_sample_size = param->input_sample_bit_depth == 8 ? 1 : 2; pic->img.enc_sample_size = sizeof(pel_t); pic->img.i_width[0] = param->org_width; pic->img.i_width[1] = param->org_width >> 1; pic->img.i_width[2] = param->org_width >> 1; pic->img.i_lines[0] = param->org_height; pic->img.i_lines[1] = param->org_height >> (param->chroma_format <= CHROMA_420 ? 1 : 0); pic->img.i_lines[2] = param->org_height >> (param->chroma_format <= CHROMA_420 ? 1 : 0); pic->img.i_csp = XAVS2_CSP_I420; pic->img.i_plane = frame->i_plane; pic->img.i_stride[0] = frame->i_stride[0] * sizeof(pel_t); pic->img.i_stride[1] = frame->i_stride[1] * sizeof(pel_t); pic->img.i_stride[2] = frame->i_stride[2] * sizeof(pel_t); pic->img.img_planes[0] = (uint8_t *)frame->planes[0]; pic->img.img_planes[1] = (uint8_t *)frame->planes[1]; pic->img.img_planes[2] = (uint8_t *)frame->planes[2]; pic->priv = frame; /* keep trace of this frame */ return 0; } /** * --------------------------------------------------------------------------- * Function : label a packet to be recycled * Parameters : * [in ] : coder - pointer to handle of xavs2 encoder (return by `encoder_create()`) * : packet - pointer to struct xavs2_outpacket_t, whose bit-stream buffer would be recycled * [out] : none * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_packet_unref(void *coder, xavs2_outpacket_t *packet) { if (coder == NULL || packet == NULL) { return 0; } if (packet->private_data != NULL) { xavs2_handler_t *h_mgr = (xavs2_handler_t *)coder; xl_append(&h_mgr->list_frames_free, packet->private_data); } return 0; } /** * --------------------------------------------------------------------------- * Function : write (send) data to the xavs2 encoder * Parameters : * [in ] : coder - pointer to wrapper of the xavs2 encoder * : pic - pointer to struct xavs2_picture_t * [out] : packet- output bit-stream * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int xavs2_encoder_encode(void *coder, xavs2_picture_t *pic, xavs2_outpacket_t *packet) { xavs2_handler_t *h_mgr = (xavs2_handler_t *)coder; xavs2_frame_t *frame = NULL; assert(h_mgr != NULL); if (pic != NULL) { xavs2_t *h = NULL; /* is this our own frame buffer ? 
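           pic->priv is filled in by xavs2_encoder_get_buffer() and points back
           to the owning xavs2_frame_t, so a picture that was not obtained from
           that call (priv == NULL) is rejected here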
*/ assert(pic->priv != NULL); if (pic->priv == NULL) { return -1; } frame = (xavs2_frame_t *)pic->priv; if (pic->i_state != XAVS2_STATE_NO_DATA) { /* copy frame properties */ frame->i_frm_type = pic->i_type; frame->i_qpplus1 = pic->i_qpplus1; frame->i_pts = pic->i_pts; frame->b_keyframe = pic->b_keyframe; /* set encoder handle */ h = h_mgr->p_coder; /* expand border if need */ if (h->param->org_width != h->i_width || h->param->org_height != h->i_height) { xavs2_frame_expand_border_mod8(h, frame); } /* set frame number here */ frame->i_frame = h_mgr->i_input; h_mgr->i_input = get_next_frame_id(h_mgr->i_input); /* set flag */ frame->i_state = 0; /* counter number of input frames */ h_mgr->num_input++; } else { /* recycle space for the pic handler */ xl_append(&h_mgr->list_frames_free, frame); frame = NULL; } } else { /* fetch an empty node from unused list */ frame = frame_buffer_get_free_frame_ipb(h_mgr); /* set flag to flush delayed frames */ frame->i_state = XAVS2_FLUSH; } /* decide slice type and send frames into encoding queue */ if (frame != NULL) { send_frame_to_enc_queue(h_mgr, frame); } /* fetch a frame */ encoder_fetch_one_encoded_frame(h_mgr, packet, pic == NULL); return 0; } xavs2-1.3/source/encoder/xavs2_api.c000066400000000000000000000076201340660520300174030ustar00rootroot00000000000000/* * xavs2_api.c * * Description of this file: * API wrapper for multi bit-depth * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "wrapper.h" #include "encoder.h" #include "version.h" /* --------------------------------------------------------------------------- * Macros */ #if SYS_WINDOWS #define ext_dyn_lib "dll" #elif SYS_MACOSX #include #define ext_dyn_lib "dylib" #else #include #define ext_dyn_lib "so" #endif /* --------------------------------------------------------------------------- */ static xavs2_api_t api_default = { XVERSION_STR, VER_MAJOR * 10 + VER_MINOR, BIT_DEPTH, xavs2_encoder_opt_help, xavs2_encoder_opt_alloc, xavs2_encoder_opt_set, xavs2_encoder_opt_set2, xavs2_encoder_opt_get, xavs2_encoder_opt_destroy, xavs2_encoder_get_buffer, xavs2_encoder_create, xavs2_encoder_destroy, xavs2_encoder_encode, xavs2_encoder_packet_unref, }; typedef const xavs2_api_t *(*xavs2_api_get_t)(int bit_depth); /* --------------------------------------------------------------------------- */ static const xavs2_api_t *xavs2_load_new_module(const char *dll_path, const char *methofd_name, int bit_depth) { /* TODO: ʹôĿʱ, ֵݹô˺յ± */ #if _WIN32 HMODULE h = LoadLibraryA(dll_path); if (h) { xavs2_api_get_t get = (xavs2_api_get_t)GetProcAddress(h, methofd_name); if (get) { return get(bit_depth); } } #else void* h = dlopen(dll_path, RTLD_LAZY | RTLD_LOCAL); if (h) { xavs2_api_get_t get = (xavs2_api_get_t)dlsym(h, methofd_name); if (get) { return get(bit_depth); } } #endif xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to load library: %s, %d-bit \n", dll_path, bit_depth); return NULL; } /** * --------------------------------------------------------------------------- * Function : get xavs2 APi handler * Parameters : * [in ] : bit_depth - required bit-depth for encoding * Return : NULL when failure * --------------------------------------------------------------------------- */ XAVS2_API const xavs2_api_t * xavs2_api_get(int bit_depth) { char s_lib_name[64]; const char* method_name = "xavs2_api_get"; switch (bit_depth) { case BIT_DEPTH: return &api_default; default: sprintf(s_lib_name, "libxavs2-%d-%dbit.%s", VER_MAJOR * 10 + VER_MINOR, bit_depth, ext_dyn_lib); return xavs2_load_new_module(s_lib_name, method_name, bit_depth); } } xavs2-1.3/source/encoder/xlist.c000066400000000000000000000134741340660520300166560ustar00rootroot00000000000000/* * xlist.c * * Description of this file: * list structure with multi-thread support of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "xlist.h" #if !defined(_MSC_VER) #include #include #endif /** * =========================================================================== * xlist * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : initialize a list * Parameters : * [in ] : xlist - pointer to the node list * [out] : none * Return : zero for success, otherwise failed * Remarks : also create 2 synchronous objects, but without any node * --------------------------------------------------------------------------- */ int xl_init(xlist_t *const xlist) { if (xlist == NULL) { return -1; } /* set list empty */ xlist->p_list_head = NULL; xlist->p_list_tail = NULL; /* set node number */ xlist->i_node_num = 0; /* create lock and conditions */ if (xavs2_thread_mutex_init(&xlist->list_mutex, NULL) < 0 || xavs2_thread_cond_init(&xlist->list_cond, NULL) < 0) { xavs2_log(NULL, XAVS2_LOG_ERROR, "Failed to init lock for xl_init()"); return -1; } return 0; } /** * --------------------------------------------------------------------------- * Function : destroy a list * Parameters : * [in ] : xlist - the list, pointer to struct xlist_t * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xl_destroy(xlist_t *const xlist) { if (xlist == NULL) { return; } /* destroy lock and conditions */ xavs2_thread_mutex_destroy(&xlist->list_mutex); xavs2_thread_cond_destroy(&xlist->list_cond); /* clear */ memset(xlist, 0, sizeof(xlist_t)); } /** * --------------------------------------------------------------------------- * Function : append data to the tail of a list * Parameters : * [in ] : xlist - the node list, pointer to struct xlist * : data - the data to append * [out] : none * Return : none * --------------------------------------------------------------------------- */ void xl_append(xlist_t *const xlist, void *node) { node_t *new_node = (node_t *)node; if (xlist == NULL || new_node == NULL) { return; /* error */ } new_node->next = NULL; /* set NULL */ xavs2_thread_mutex_lock(&xlist->list_mutex); /* lock */ /* append this node */ if (xlist->p_list_tail != NULL) { /* append this node at tail */ xlist->p_list_tail->next = new_node; } else { xlist->p_list_head = new_node; } xlist->p_list_tail = new_node; /* point to the tail node */ xlist->i_node_num++; /* increase the node number */ xavs2_thread_mutex_unlock(&xlist->list_mutex); /* unlock */ /* all is done, notify one waiting thread to work */ xavs2_thread_cond_signal(&xlist->list_cond); } /** * --------------------------------------------------------------------------- * Function : remove one node from the list's head position * Parameters : * [in ] : xlist - the node list, pointer to struct xlist_t * : wait - wait the semaphore? 
* [out] : none * Return : node pointer for success, or NULL for failure * --------------------------------------------------------------------------- */ void *xl_remove_head(xlist_t *const xlist, const int wait) { node_t *node = NULL; if (xlist == NULL) { return NULL; /* error */ } xavs2_thread_mutex_lock(&xlist->list_mutex); while (wait && !xlist->i_node_num) { xavs2_thread_cond_wait(&xlist->list_cond, &xlist->list_mutex); } /* remove the header node */ if (xlist->i_node_num > 0) { node = xlist->p_list_head; /* point to the header node */ /* modify the list */ xlist->p_list_head = node->next; if (xlist->p_list_head == NULL) { /* there are no any node in this list, reset the tail pointer */ xlist->p_list_tail = NULL; } xlist->i_node_num--; /* decrease the number */ } xavs2_thread_mutex_unlock(&xlist->list_mutex); return node; } xavs2-1.3/source/encoder/xlist.h000066400000000000000000000060431340660520300166550ustar00rootroot00000000000000/* * xlist.h * * Description of this file: * list structure with multi-thread support of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef XAVS2_XLIST_H #define XAVS2_XLIST_H /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * node */ typedef struct node_t node_t; struct node_t { node_t *next; /* pointer to next node */ }; /* --------------------------------------------------------------------------- * xlist_t */ typedef struct xlist_t { node_t *p_list_head; /* pointer to head of node list */ node_t *p_list_tail; /* pointer to tail of node list */ xavs2_thread_cond_t list_cond; /* list condition variable */ xavs2_thread_mutex_t list_mutex; /* list mutex lock */ int i_node_num; /* node number in the list */ } xlist_t; /** * =========================================================================== * interface function declares * =========================================================================== */ /* --------------------------------------------------------------------------- * xlist */ #define xl_init FPFX(xl_init) int xl_init(xlist_t *const xlist); #define xl_destroy FPFX(xl_destroy) void xl_destroy(xlist_t *const xlist); #define xl_append FPFX(xl_append) void xl_append(xlist_t *const xlist, void *node); #define xl_remove_head FPFX(xl_remove_head) void *xl_remove_head(xlist_t *const xlist, const int wait); #endif // XAVS2_XLIST_H xavs2-1.3/source/encoder/yuv_writer.c000066400000000000000000000043451340660520300177270ustar00rootroot00000000000000/* * yuv_writer.c * * Description of this file: * YUV Writing functions definition of the xavs2 library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "wrapper.h" #include "encoder.h" /* --------------------------------------------------------------------------- */ void dump_yuv_out(xavs2_t *h, FILE *fp, xavs2_frame_t *frame, int img_w, int img_h) { int j; if (fp != NULL) { UNUSED_PARAMETER(h); for (j = 0; j < img_h; j++) { fwrite(frame->planes[0] + j * frame->i_stride[0], img_w, 1, fp); } if (frame->i_plane == 3) { for (j = 0; j < (img_h >> 1); j++) { fwrite(frame->planes[1] + j * frame->i_stride[1], img_w >> 1, 1, fp); } for (j = 0; j < (img_h >> 1); j++) { fwrite(frame->planes[2] + j * frame->i_stride[2], img_w >> 1, 1, fp); } } } } xavs2-1.3/source/test/000077500000000000000000000000001340660520300146765ustar00rootroot00000000000000xavs2-1.3/source/test/test.c000066400000000000000000000212321340660520300160210ustar00rootroot00000000000000/* * test.c * * Description of this file: * Main function * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ /* --------------------------------------------------------------------------- * disable warning C4996: functions or variables may be unsafe. 
*/ #if defined(_MSC_VER) #define _CRT_SECURE_NO_WARNINGS #endif /* --------------------------------------------------------------------------- * include files */ #include #include #include #include #include #if defined(_MSC_VER) #include #include #else #include #include #endif #include "xavs2.h" /* --------------------------------------------------------------------------- */ static FILE *g_infile = NULL; static FILE *g_outfile = NULL; const xavs2_api_t *g_api = NULL; /* --------------------------------------------------------------------------- */ static void dump_encoded_data(void *coder, xavs2_outpacket_t *packet) { if (g_outfile) { if (packet->state == XAVS2_STATE_ENCODED) { fwrite(packet->stream, packet->len, 1, g_outfile); } else if (packet->state == XAVS2_STATE_FLUSH_END) { fwrite(packet->stream, packet->len, 1, g_outfile); } g_api->encoder_packet_unref(coder, packet); } } /* --------------------------------------------------------------------------- * read one frame data from file line by line */ static int read_one_frame(xavs2_image_t *img, int shift_in) { int k, j; if (img->in_sample_size != img->enc_sample_size) { static uint8_t p_buffer[16 * 1024]; for (k = 0; k < img->i_plane; k++) { int i_width = img->i_width[k]; int i_stride = img->i_stride[k]; if (img->in_sample_size == 1) { for (j = 0; j < img->i_lines[k]; j++) { uint16_t *p_plane = (uint16_t *)&img->img_planes[k][j * i_stride]; int i; if (fread(p_buffer, i_width, 1, g_infile) != 1) { return -1; } memset(p_plane, 0, i_stride); for (i = 0; i < i_width; i++) { p_plane[i] = p_buffer[i] << shift_in; } } } else { printf("Not supported high bit-depth for reading\n"); return -1; } } } else { for (k = 0; k < img->i_plane; k++) { int size_line = img->i_width[k] * img->in_sample_size; for (j = 0; j < img->i_lines[k]; j++) { if (fread(img->img_planes[k] + img->i_stride[k] * j, size_line, 1, g_infile) != 1) { return -1; } } } } return 0; } int test_encoder(const xavs2_api_t *api, xavs2_param_t *param) { const char *in_file = api->opt_get(param, "input"); const char *bs_file = api->opt_get(param, "output"); const int shift_in = atoi(api->opt_get(param, "SampleShift")); int num_frames = atoi(api->opt_get(param, "frames")); xavs2_picture_t pic; void *encoder = NULL; int k; xavs2_outpacket_t packet = {0}; /* open input & output files */ if ((g_infile = fopen(in_file, "rb")) == NULL) { fprintf(stderr, "error opening input file: \"%s\"\n", in_file); return -1; } if ((g_outfile = fopen(bs_file, "wb")) == NULL) { fprintf(stderr, "error opening output file: \"%s\"\n", bs_file); fclose(g_infile); return -1; } if (num_frames == 0) { num_frames = 1 << 30; } /* create the xavs2 video encoder */ encoder = api->encoder_create(param); if (encoder == NULL) { fprintf(stderr, "Error: Can not create encoder. 
Null pointer returned.\n"); fclose(g_infile); fclose(g_outfile); return -1; } /* read frame data and send to the xavs2 video encoder */ for (k = 0; k < num_frames; k++) { if (api->encoder_get_buffer(encoder, &pic) < 0) { fprintf(stderr, "failed to get frame buffer [%3d,%3d].\n", k, num_frames); break; } if (read_one_frame(&pic.img, shift_in) < 0) { fprintf(stderr, "failed to read one YUV frame [%3d/%3d]\n", k, num_frames); /* return the buffer to the encoder */ pic.i_state = XAVS2_STATE_NO_DATA; api->encoder_encode(encoder, &pic, &packet); dump_encoded_data(encoder, &packet); break; } pic.i_state = 0; pic.i_type = XAVS2_TYPE_AUTO; pic.i_pts = k; api->encoder_encode(encoder, &pic, &packet); dump_encoded_data(encoder, &packet); } /* flush delayed frames */ for (; packet.state != XAVS2_STATE_FLUSH_END;) { api->encoder_encode(encoder, NULL, &packet); dump_encoded_data(encoder, &packet); } /* destroy the encoder */ api->encoder_destroy(encoder); return 0; } /* --------------------------------------------------------------------------- */ const xavs2_api_t *load_xavs2_library(int argc, char **argv, xavs2_param_t **p_param) { const xavs2_api_t *api = NULL; /* encoding parameters */ int guess_bit_depth; xavs2_param_t *param = NULL; /* get API handler */ for (guess_bit_depth = 8; guess_bit_depth <= 10; guess_bit_depth += 2) { int enc_bit_depth = 0; api = xavs2_api_get(guess_bit_depth); if (api == NULL) { continue; } fprintf(stdout, "CAVS2Enc lib loaded: version %s %d-bit\n", api->s_version_source, api->internal_bit_depth); fflush(stdout); if (argc < 2) { api->opt_help(); /* at lease one additional parameter needed */ return NULL; } /* parse parameters and modify the parameters */ param = api->opt_alloc(); if (api->opt_set(param, argc, argv) < 0) { fprintf(stdout, "parse contents error.\n"); return NULL; } enc_bit_depth = atoi(api->opt_get(param, "BitDepth")); if (enc_bit_depth == api->internal_bit_depth) { fprintf(stdout, "using CAVS2Enc lib: version %s %d-bit success\n", api->s_version_source, api->internal_bit_depth); break; } fprintf(stdout, "Incompatible encoding bit-depth to library: %d-bit lib for encoding %d-bit source\n", api->internal_bit_depth, enc_bit_depth); /* free spaces */ api->opt_destroy(param); param = NULL; api = NULL; } *p_param = param; return api; } /* --------------------------------------------------------------------------- */ int main(int argc, char **argv) { /* encoding parameters */ xavs2_param_t *param = NULL; int ret; /* get API handler */ g_api = load_xavs2_library(argc, argv, ¶m); if (g_api == NULL) { fprintf(stdout, "CAVS2Enc lib load error\n"); return -1; } fflush(NULL); // flush all output streams /* test encoding */ ret = test_encoder(g_api, param); /* free spaces */ g_api->opt_destroy(param); return ret; } xavs2-1.3/source/xavs2.h000066400000000000000000000370041340660520300151370ustar00rootroot00000000000000/* * xavs2.h * * Description of this file: * API interface of the xavs2 encoder library * * -------------------------------------------------------------------------- * * xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * Homepage1: http://vcl.idm.pku.edu.cn/xavs2 * Homepage2: https://github.com/pkuvcl/xavs2 * Homepage3: https://gitee.com/pkuvcl/xavs2 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef XAVS2_XAVS2_H #define XAVS2_XAVS2_H #include #ifdef __cplusplus extern "C" { // only need to export C interface if used by C++ source code #endif #define XAVS2_BUILD 13 /* xavs2 build version */ /** * =========================================================================== * define XAVS2_API * =========================================================================== */ #ifdef XAVS2_EXPORTS # ifdef __GNUC__ /* for Linux */ # if __GNUC__ >= 4 # define XAVS2_API __attribute__((visibility("default"))) # else # define XAVS2_API __attribute__((dllexport)) # endif # else /* for windows */ # define XAVS2_API __declspec(dllexport) # endif #else # ifdef __GNUC__ /* for Linux */ # define XAVS2_API # else /* for windows */ # define XAVS2_API __declspec(dllimport) # endif #endif /** * =========================================================================== * const defines * =========================================================================== */ /* --------------------------------------------------------------------------- * state defines */ #define XAVS2_UNDEFINE 0 #define XAVS2_STATE_NO_DATA 1 /* no bitstream data */ #define XAVS2_STATE_ENCODED 2 /* one frame has been encoded */ #define XAVS2_STATE_FLUSH_END 9 /* flush end */ #define XAVS2_FLUSH 99 /* flush (fetch bitstream data only) */ /* --------------------------------------------------------------------------- * slice type */ #define XAVS2_TYPE_AUTO 0 /* Let xavs2 encoder choose the right type */ #define XAVS2_TYPE_IDR 1 #define XAVS2_TYPE_I 2 #define XAVS2_TYPE_P 3 #define XAVS2_TYPE_F 4 #define XAVS2_TYPE_B 5 #define XAVS2_TYPE_KEYFRAME 6 /* IDR or I depending on b_open_gop option */ #define XAVS2_TYPE_G 7 #define XAVS2_TYPE_GB 8 /* --------------------------------------------------------------------------- * color space type */ #define XAVS2_CSP_MASK 0x00ff /* mask */ #define XAVS2_CSP_NONE 0x0000 /* invalid mode */ #define XAVS2_CSP_I420 0x0001 /* yuv 4:2:0 planar */ #define XAVS2_CSP_YV12 0x0002 /* yvu 4:2:0 planar */ #define XAVS2_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */ #define XAVS2_CSP_MAX 0x0004 /* end of list */ #define XAVS2_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define XAVS2_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ /* --------------------------------------------------------------------------- * log level */ enum log_level_e { XAVS2_LOG_NONE = -1, /* none */ XAVS2_LOG_ERROR = 0, /* level 0 */ XAVS2_LOG_WARNING = 1, /* level 1 */ XAVS2_LOG_INFO = 2, /* level 2 */ XAVS2_LOG_DEBUG 
= 3, /* level 3 */ }; /* --------------------------------------------------------------------------- * others */ /** * =========================================================================== * interface struct type defines * =========================================================================== */ /* ----------------------------- * xavs2 encoder input parameters * * For version safety you may use * xavs2_encoder_opt_alloc(), xavs2_encoder_opt_destroy() * to manage the allocation of xavs2_param_t instances, and * xavs2_encoder_opt_set(), xavs2_encoder_opt_set2() * to assign values by name, and * xavs2_encoder_opt_get(p, "param_name") * to get specific configuration value string. * * Just treat xavs2_param_t as an opaque data structure */ typedef struct xavs2_param_t xavs2_param_t; /* --------------------------------------------------------------------------- * xavs2_image_t */ typedef struct xavs2_image_t { int i_csp; /* color space */ int in_sample_size; /* input sample size in byte */ int enc_sample_size; /* encoding sample size in byte */ int i_plane; /* number of image planes */ int i_width[3]; /* widths for each plane */ int i_lines[3]; /* heights for each plane */ int i_stride[3]; /* strides for each plane */ uint8_t *img_planes[4]; /* pointers to each plane (planes[3]: start buffer address) */ } xavs2_image_t; /* --------------------------------------------------------------------------- * xavs2_picture_t */ typedef struct xavs2_picture_t { /* [IN ] flush or not * [OUT] encoding state */ int i_state; /* [IN ] force picture type (if not auto) * if xavs2 encoder encoding parameters are violated in the forcing of picture * types, xavs2 encoder will correct the input picture type and log a warning. * the quality of frame type decisions may suffer if a great deal of * fine-grained mixing of auto and forced frametypes is done * [OUT] type of the picture encoded */ int i_type; /* [IN ] force quantizer for != XAVS2_QP_AUTO */ int i_qpplus1; /* [OUT] whether this frame is a keyframe. important when using modes that * result in SEI recovery points being used instead of IDR frames */ int b_keyframe; /* [IN ] user pts. [OUT]: pts of encoded picture (user) */ int64_t i_pts; /* [OUT] frame dts. 
When the pts of the first frame is close to zero, * initial frames may have a negative dts which must be dealt * with by any muxer */ int64_t i_dts; /* [IN ] raw data */ xavs2_image_t img; /* [IN ] private pointer, DO NOT change it */ void *priv; } xavs2_picture_t; /* --------------------------------------------------------------------------- * xavs2_outpacket_t */ typedef struct xavs2_outpacket_t { void *private_data; /* private pointer, DONOT change it */ const uint8_t *stream; /* pointer to bitstream data buffer */ int len; /* length of bitstream data */ int state; /* state of current frame encoded */ int type; /* type of current frame encoded */ int64_t pts; /* pts of current frame encoded */ int64_t dts; /* dts of current frame encoded */ void *opaque; /* pointer to user data */ } xavs2_outpacket_t; /** * =========================================================================== * interface function declares: parameters * =========================================================================== */ typedef struct xavs2_api_t { /** * =========================================================================== * version information * =========================================================================== */ const char *s_version_source; /* source tree version SHA */ int version_build; /* XAVS2_BUILD version (10 * VER_MAJOR + VER_MINOR) */ int internal_bit_depth; /* internal bit-depth for encoding */ /** * =========================================================================== * function pointers * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : Output help parameters * Parameters : * [in ] : none * [out] : instructions would be output through standard output stream (stdout) * Return : none * --------------------------------------------------------------------------- */ void (*opt_help)(void); /** * --------------------------------------------------------------------------- * Function : initialize default parameters for the xavs2 video encoder * Parameters : * [in ] : none * Return : parameter handler, can be further configured * --------------------------------------------------------------------------- */ xavs2_param_t *(*opt_alloc)(void); /** * --------------------------------------------------------------------------- * Function : Parsing encoding parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : argc - number of command line parameters * [in ] : argv - pointer to parameter strings * Return : int - zero for success, otherwise failed * --------------------------------------------------------------------------- */ int (*opt_set)(xavs2_param_t *param, int argc, char *argv[]); /** * --------------------------------------------------------------------------- * Function : Parsing encoding parameters * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of parameter * [in ] : value_string - parameter value * Return : int - zero for success, otherwise failed * --------------------------------------------------------------------------- */ int (*opt_set2)(xavs2_param_t *param, const char *name, const char *value_string); /** * --------------------------------------------------------------------------- * Function : get value of a specific parameter * Parameters : * [in ] : param - pointer to struct xavs2_param_t * [in ] : name - name of a parameter (input, output, width, height, frames) * 
Return : const char *: value string * --------------------------------------------------------------------------- */ const char *(*opt_get)(xavs2_param_t *param, const char *name); /** * --------------------------------------------------------------------------- * Function : free memory of parameter * Parameters : * [in ] : none * [out] : none * Return : none * --------------------------------------------------------------------------- */ void (*opt_destroy)(xavs2_param_t *param); /** * =========================================================================== * encoder API * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : get buffer for the encoder caller * Parameters : * [in ] : coder - pointer to handle of xavs2 encoder * : pic - pointer to struct xavs2_picture_t * [out] : pic - memory would be allocated for the image planes * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int (*encoder_get_buffer)(void *coder, xavs2_picture_t *pic); /** * --------------------------------------------------------------------------- * Function : create and initialize the xavs2 video encoder * Parameters : * [in ] : param - pointer to struct xavs2_param_t * : dump_func - pointer to struct xavs2_dump_func_t * : opaque - user data * [out] : none * Return : handle of xavs2 encoder, none zero for success, otherwise false * --------------------------------------------------------------------------- */ void *(*encoder_create)(xavs2_param_t *param); /** * --------------------------------------------------------------------------- * Function : destroy the xavs2 video encoder * Parameters : * [in ] : coder - pointer to handle of xavs2 encoder (return by `encoder_create()`) * [out] : none * Return : none * Note : this API is *NOT* thread-safe, * and can not be called simultaneously with other APIs. 
* --------------------------------------------------------------------------- */ void (*encoder_destroy)(void *coder); /** * --------------------------------------------------------------------------- * Function : write (send) data to the xavs2 encoder * Parameters : * [in ] : coder - pointer to handle of xavs2 encoder (return by `encoder_create()`) * : pic - pointer to struct xavs2_picture_t * [out] : packet- output bit-stream * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int (*encoder_encode)(void *coder, xavs2_picture_t *pic, xavs2_outpacket_t *packet); /** * --------------------------------------------------------------------------- * Function : label a packet to be recycled * Parameters : * [in ] : coder - pointer to handle of xavs2 encoder (return by `encoder_create()`) * : packet - pointer to struct xavs2_outpacket_t, whose bit-stream buffer would be recycled * [out] : none * Return : zero for success, otherwise failed * --------------------------------------------------------------------------- */ int (*encoder_packet_unref)(void *coder, xavs2_outpacket_t *packet); } xavs2_api_t; /** * --------------------------------------------------------------------------- * Function : get xavs2 APi handler * Parameters : * [in ] : bit_depth - required bit-depth for encoding * Return : NULL when failure * --------------------------------------------------------------------------- */ XAVS2_API const xavs2_api_t * xavs2_api_get(int bit_depth); #ifdef __cplusplus } #endif #endif // XAVS2_XAVS2_H xavs2-1.3/version.sh000077500000000000000000000071731340660520300144530ustar00rootroot00000000000000#!/bin/sh # ============================================================================ # File: # version.sh # - get version of repository and generate the file version.h # Author: # Falei LUO # ============================================================================ # setting API version api="$(grep '#define XAVS2_BUILD' < source/xavs2.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/')" VER_R=0 VER_SHA='not-in-git-tree' # get version of remote origin/master and local HEAD if [ -d .git ] && command -v git >/dev/null 2>&1 ; then VER_R=`git rev-list --count origin/master` VER_SHA=`git rev-parse HEAD | cut -c -16` fi # generate version numbers VER_MAJOR=`echo $(($api / 10))` VER_MINOR=`echo $(($api % 10))` # date and time information BUILD_TIME=`date "+%Y-%m-%d %H:%M:%S"` # generate the file version.h echo "// ===========================================================================" > version.h echo "// version.h" >> version.h echo "// - collection of version numbers" >> version.h echo "//" >> version.h echo "// Author: Falei LUO " >> version.h echo "//" >> version.h echo "// ===========================================================================" >> version.h echo "" >> version.h echo "#ifndef XAVS2_VERSION_H" >> version.h echo "#define XAVS2_VERSION_H" >> version.h echo "" >> version.h echo "// version number" >> version.h echo "#define VER_MAJOR $VER_MAJOR // major version number" >> version.h echo "#define VER_MINOR $VER_MINOR // minor version number" >> version.h echo "#define VER_BUILD $VER_R // build number" >> version.h echo "#define VER_SHA_STR \"$VER_SHA\" // commit id" >> version.h echo "" >> version.h echo "// stringify" >> version.h echo "#define _TOSTR(x) #x // stringify x" >> version.h echo "#define TOSTR(x) _TOSTR(x) // stringify x, perform macro expansion" >> version.h echo "" >> version.h echo "// define XVERSION 
string" >> version.h echo "#define XVERSION VER_MAJOR, VER_MINOR, VER_BUILD" >> version.h echo "#define XVERSION_STR TOSTR(VER_MAJOR) \".\" TOSTR(VER_MINOR) \".\" TOSTR(VER_BUILD) \" \" VER_SHA_STR" >> version.h echo "#define XBUILD_TIME \"$BUILD_TIME\"" >> version.h echo "" >> version.h echo "#endif // __VERSION_H__" >> version.h mv version.h source/version.h # show version informations echo "#define XAVS2_VERSION $api" echo "#define XAVS2_POINTVER \"$VER_MAJOR.$VER_MINOR.$VER_R\""